-
Notifications
You must be signed in to change notification settings - Fork 204
Graceful termination for server #794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,7 +95,7 @@ type Proxy struct { | |
| server *server.ProxyServer | ||
| } | ||
|
|
||
| type StopFunc func() | ||
| type StopFunc func(context.Context) error | ||
|
|
||
| func (p *Proxy) Run(o *options.ProxyRunOptions, stopCh <-chan struct{}) error { | ||
| o.Print() | ||
|
|
@@ -145,16 +145,12 @@ func (p *Proxy) Run(o *options.ProxyRunOptions, stopCh <-chan struct{}) error { | |
| if err != nil { | ||
| return fmt.Errorf("failed to run the frontend server: %v", err) | ||
| } | ||
| if frontendStop != nil { | ||
| defer frontendStop() | ||
| } | ||
|
|
||
| klog.V(1).Infoln("Starting agent server for tunnel connections.") | ||
| err = p.runAgentServer(o, p.server) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to run the agent server: %v", err) | ||
| } | ||
| defer p.agentServer.Stop() | ||
|
|
||
| labels, err := util.ParseLabels(o.LeaseLabel) | ||
| if err != nil { | ||
|
|
@@ -182,17 +178,97 @@ func (p *Proxy) Run(o *options.ProxyRunOptions, stopCh <-chan struct{}) error { | |
| if err != nil { | ||
| return fmt.Errorf("failed to run the admin server: %v", err) | ||
| } | ||
| defer p.adminServer.Close() | ||
|
|
||
| klog.V(1).Infoln("Starting health server for healthchecks.") | ||
| err = p.runHealthServer(o, p.server) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to run the health server: %v", err) | ||
| } | ||
| defer p.healthServer.Close() | ||
|
|
||
| <-stopCh | ||
| klog.V(1).Infoln("Shutting down server.") | ||
| klog.V(1).Infoln("Received shutdown signal, initiating graceful shutdown.") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should wrap most of this new code in a if graceful timeout != 0 block. |
||
|
|
||
| // Start graceful shutdown with timeout | ||
| shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), o.GracefulShutdownTimeout) | ||
| defer shutdownCancel() | ||
|
|
||
| // Create a WaitGroup to track shutdown of all components | ||
| var wg sync.WaitGroup | ||
|
|
||
| // Shutdown frontend server gracefully (if available) | ||
| if frontendStop != nil { | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| klog.V(1).Infoln("Gracefully stopping frontend server...") | ||
| if err := frontendStop(shutdownCtx); err != nil { | ||
| klog.ErrorS(err, "failed to shut down frontend server") | ||
| } else { | ||
| klog.V(1).Infoln("frontend server stopped.") | ||
| } | ||
| }() | ||
| } | ||
|
|
||
| // Shutdown agent server gracefully | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| klog.V(1).Infoln("Gracefully stopping agent server...") | ||
| p.agentServer.GracefulStop() | ||
| klog.V(1).Infoln("agent server stopped.") | ||
| }() | ||
|
|
||
| // Shutdown admin server gracefully | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| klog.V(1).Infoln("Gracefully stopping admin server...") | ||
| if err := p.adminServer.Shutdown(shutdownCtx); err != nil { | ||
| klog.ErrorS(err, "failed to shut down admin server") | ||
|
|
||
| } else { | ||
| klog.V(1).Infoln("admin server stopped.") | ||
| } | ||
| }() | ||
|
|
||
| // Shutdown health server gracefully | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| klog.V(1).Infoln("Gracefully stopping health server...") | ||
| if err := p.healthServer.Shutdown(shutdownCtx); err != nil { | ||
| klog.ErrorS(err, "failed to shut down health server") | ||
| } else { | ||
| klog.V(1).Infoln("health server stopped.") | ||
| } | ||
| }() | ||
|
|
||
| // Wait for all servers to shutdown or timeout | ||
| shutdownComplete := make(chan struct{}) | ||
| go func() { | ||
| wg.Wait() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we know what happens if the other goroutined have completed before this Wait is called? My guess is that is will never trigger. I think it makes more sense to do the non-optional adds. Kick off this goroutines, then kick off the workers. |
||
| close(shutdownComplete) | ||
| }() | ||
|
|
||
| select { | ||
| case <-shutdownComplete: | ||
| klog.V(1).Infoln("Graceful shutdown completed successfully.") | ||
| case <-shutdownCtx.Done(): | ||
| klog.Warningf("Graceful shutdown timed out after %v, forcing termination.", o.GracefulShutdownTimeout) | ||
| // Force stop all servers that might still be running | ||
| if p.agentServer != nil { | ||
| p.agentServer.Stop() | ||
| } | ||
| if p.adminServer != nil { | ||
| p.adminServer.Close() | ||
| } | ||
| if p.healthServer != nil { | ||
| p.healthServer.Close() | ||
| } | ||
| // frontend server's force-stop is handled by its StopFunc | ||
| } | ||
|
|
||
| klog.V(1).Infoln("Server shutdown complete.") | ||
|
|
||
| return nil | ||
| } | ||
|
|
@@ -260,7 +336,10 @@ func (p *Proxy) runUDSFrontendServer(ctx context.Context, o *options.ProxyRunOpt | |
| "udsFile", o.UdsName, | ||
| ) | ||
| go runpprof.Do(context.Background(), labels, func(context.Context) { grpcServer.Serve(lis) }) | ||
| stop = grpcServer.GracefulStop | ||
| stop = func(_ context.Context) error { | ||
| grpcServer.GracefulStop() | ||
| return nil | ||
| } | ||
| } else { | ||
| // http-connect | ||
| server := &http.Server{ | ||
|
|
@@ -269,9 +348,8 @@ func (p *Proxy) runUDSFrontendServer(ctx context.Context, o *options.ProxyRunOpt | |
| Server: s, | ||
| }, | ||
| } | ||
| stop = func() { | ||
| err := server.Shutdown(ctx) | ||
| klog.ErrorS(err, "error shutting down server") | ||
| stop = func(shutdownCtx context.Context) error { | ||
| return server.Shutdown(shutdownCtx) | ||
| } | ||
| labels := runpprof.Labels( | ||
| "core", "udsHttpFrontend", | ||
|
|
@@ -329,7 +407,7 @@ func (p *Proxy) getTLSConfig(caFile, certFile, keyFile string, cipherSuites []st | |
| return tlsConfig, nil | ||
| } | ||
|
|
||
| func (p *Proxy) runMTLSFrontendServer(ctx context.Context, o *options.ProxyRunOptions, s *server.ProxyServer) (StopFunc, error) { | ||
| func (p *Proxy) runMTLSFrontendServer(_ context.Context, o *options.ProxyRunOptions, s *server.ProxyServer) (StopFunc, error) { | ||
| var stop StopFunc | ||
|
|
||
| var tlsConfig *tls.Config | ||
|
|
@@ -356,7 +434,10 @@ func (p *Proxy) runMTLSFrontendServer(ctx context.Context, o *options.ProxyRunOp | |
| "port", strconv.Itoa(o.ServerPort), | ||
| ) | ||
| go runpprof.Do(context.Background(), labels, func(context.Context) { grpcServer.Serve(lis) }) | ||
| stop = grpcServer.GracefulStop | ||
| stop = func(_ context.Context) error { | ||
| grpcServer.GracefulStop() | ||
| return nil | ||
| } | ||
| } else { | ||
| // http-connect | ||
| server := &http.Server{ | ||
|
|
@@ -368,11 +449,8 @@ func (p *Proxy) runMTLSFrontendServer(ctx context.Context, o *options.ProxyRunOp | |
| }, | ||
| TLSNextProto: make(map[string]func(*http.Server, *tls.Conn, http.Handler)), | ||
| } | ||
| stop = func() { | ||
| err := server.Shutdown(ctx) | ||
| if err != nil { | ||
| klog.ErrorS(err, "failed to shutdown server") | ||
| } | ||
| stop = func(shutdownCtx context.Context) error { | ||
| return server.Shutdown(shutdownCtx) | ||
| } | ||
| labels := runpprof.Labels( | ||
| "core", "mtlsHttpFrontend", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like the feature but I think we need to be backward compatible. We should default to 0 which means shutdown immediately and is the current behavior.