Skip to content

Commit

Permalink
improve testing of route failover logic
Browse files Browse the repository at this point in the history
Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
  • Loading branch information
kradalby authored and juanfont committed Apr 15, 2024
1 parent bf4fd07 commit 1704977
Show file tree
Hide file tree
Showing 11 changed files with 518 additions and 143 deletions.
71 changes: 42 additions & 29 deletions hscontrol/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ func (h *Headscale) createRouter(grpcMux *grpcRuntime.ServeMux) *mux.Router {
return router
}

// Serve launches a GIN server with the Headscale API.
// Serve launches the HTTP and gRPC servers that serve Headscale and its API.
func (h *Headscale) Serve() error {
if _, enableProfile := os.LookupEnv("HEADSCALE_PROFILING_ENABLED"); enableProfile {
if profilePath, ok := os.LookupEnv("HEADSCALE_PROFILING_PATH"); ok {
Expand Down Expand Up @@ -532,7 +532,7 @@ func (h *Headscale) Serve() error {

region, err := h.DERPServer.GenerateRegion()
if err != nil {
return err
return fmt.Errorf("generating DERP region for embedded server: %w", err)
}

if h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion {
Expand Down Expand Up @@ -607,14 +607,14 @@ func (h *Headscale) Serve() error {
}...,
)
if err != nil {
return err
return fmt.Errorf("setting up gRPC gateway via socket: %w", err)
}

// Connect to the gRPC server over localhost to skip
// the authentication.
err = v1.RegisterHeadscaleServiceHandler(ctx, grpcGatewayMux, grpcGatewayConn)
if err != nil {
return err
return fmt.Errorf("registering Headscale API service to gRPC: %w", err)
}

// Start the local gRPC server without TLS and without authentication
Expand All @@ -635,9 +635,7 @@ func (h *Headscale) Serve() error {

tlsConfig, err := h.getTLSSettings()
if err != nil {
log.Error().Err(err).Msg("Failed to set up TLS configuration")

return err
return fmt.Errorf("configuring TLS settings: %w", err)
}

//
Expand Down Expand Up @@ -702,15 +700,11 @@ func (h *Headscale) Serve() error {
httpServer := &http.Server{
Addr: h.cfg.Addr,
Handler: router,
ReadTimeout: types.HTTPReadTimeout,
// Go does not handle timeouts in HTTP very well, and there is
// no good way to handle streaming timeouts, therefore we need to
// keep this at unlimited and be careful to clean up connections
// https://blog.cloudflare.com/the-complete-guide-to-golang-net-http-timeouts/#aboutstreaming
// TODO(kradalby): this timeout can now be set per handler with http.ResponseController:
// https://www.alexedwards.net/blog/how-to-use-the-http-responsecontroller-type
// replace this so only the longpoller has no timeout.
WriteTimeout: 0,
ReadTimeout: types.HTTPTimeout,

// Long polling should not have any timeout; this is overridden
// further down the chain.
WriteTimeout: types.HTTPTimeout,
}

var httpListener net.Listener
Expand All @@ -729,27 +723,46 @@ func (h *Headscale) Serve() error {
log.Info().
Msgf("listening and serving HTTP on: %s", h.cfg.Addr)

promMux := http.NewServeMux()
promMux.Handle("/metrics", promhttp.Handler())
debugMux := http.NewServeMux()
debugMux.HandleFunc("/debug/notifier", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(h.nodeNotifier.String()))

promHTTPServer := &http.Server{
return
})
debugMux.HandleFunc("/debug/mapresp", func(w http.ResponseWriter, r *http.Request) {
h.mapSessionMu.Lock()
defer h.mapSessionMu.Unlock()

var b strings.Builder
b.WriteString("mapresponders:\n")
for k, v := range h.mapSessions {
fmt.Fprintf(&b, "\t%d: %p\n", k, v)
}

w.WriteHeader(http.StatusOK)
w.Write([]byte(b.String()))

return
})
debugMux.Handle("/metrics", promhttp.Handler())

debugHTTPServer := &http.Server{
Addr: h.cfg.MetricsAddr,
Handler: promMux,
ReadTimeout: types.HTTPReadTimeout,
Handler: debugMux,
ReadTimeout: types.HTTPTimeout,
WriteTimeout: 0,
}

var promHTTPListener net.Listener
promHTTPListener, err = net.Listen("tcp", h.cfg.MetricsAddr)

debugHTTPListener, err := net.Listen("tcp", h.cfg.MetricsAddr)
if err != nil {
return fmt.Errorf("failed to bind to TCP address: %w", err)
}

errorGroup.Go(func() error { return promHTTPServer.Serve(promHTTPListener) })
errorGroup.Go(func() error { return debugHTTPServer.Serve(debugHTTPListener) })

log.Info().
Msgf("listening and serving metrics on: %s", h.cfg.MetricsAddr)
Msgf("listening and serving debug and metrics on: %s", h.cfg.MetricsAddr)

var tailsqlContext context.Context
if tailsqlEnabled {
Expand Down Expand Up @@ -815,7 +828,7 @@ func (h *Headscale) Serve() error {
context.Background(),
types.HTTPShutdownTimeout,
)
if err := promHTTPServer.Shutdown(ctx); err != nil {
if err := debugHTTPServer.Shutdown(ctx); err != nil {
log.Error().Err(err).Msg("Failed to shutdown prometheus http")
}
if err := httpServer.Shutdown(ctx); err != nil {
Expand All @@ -833,7 +846,7 @@ func (h *Headscale) Serve() error {
}

// Close network listeners
promHTTPListener.Close()
debugHTTPListener.Close()
httpListener.Close()
grpcGatewayConn.Close()

Expand Down Expand Up @@ -898,7 +911,7 @@ func (h *Headscale) getTLSSettings() (*tls.Config, error) {
server := &http.Server{
Addr: h.cfg.TLS.LetsEncrypt.Listen,
Handler: certManager.HTTPHandler(http.HandlerFunc(h.redirect)),
ReadTimeout: types.HTTPReadTimeout,
ReadTimeout: types.HTTPTimeout,
}

go func() {
Expand Down
37 changes: 25 additions & 12 deletions hscontrol/db/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ import (
"errors"
"fmt"
"net/netip"
"sort"

"github.com/juanfont/headscale/hscontrol/policy"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/rs/zerolog/log"
"gorm.io/gorm"
"tailscale.com/util/set"
)

var ErrRouteIsNotAvailable = errors.New("route is not available")
Expand Down Expand Up @@ -402,11 +404,10 @@ func SaveNodeRoutes(tx *gorm.DB, node *types.Node) (bool, error) {
return sendUpdate, nil
}

// FailoverRouteIfAvailable takes a node and checks if the node's route
// currently have a functioning host that exposes the network.
// If it does not, it is failed over to another suitable route if there
// is one.
func FailoverRouteIfAvailable(
// FailoverNodeRoutesIfNeccessary takes a node and checks if the node's routes
// need to be failed over to another host.
// If needed, the failover will be attempted.
func FailoverNodeRoutesIfNeccessary(
tx *gorm.DB,
isConnected types.NodeConnectedMap,
node *types.Node,
Expand All @@ -416,8 +417,12 @@ func FailoverRouteIfAvailable(
return nil, nil
}

var changedNodes []types.NodeID
log.Trace().Msgf("NODE ROUTES: %d", len(nodeRoutes))
changedNodes := make(set.Set[types.NodeID])

nodeRouteLoop:
for _, nodeRoute := range nodeRoutes {
log.Trace().Msgf("NODE ROUTE: %d", nodeRoute.ID)
routes, err := getRoutesByPrefix(tx, netip.Prefix(nodeRoute.Prefix))
if err != nil {
return nil, fmt.Errorf("getting routes by prefix: %w", err)
Expand All @@ -427,29 +432,37 @@ func FailoverRouteIfAvailable(
if route.IsPrimary {
// if we have a primary route, and the node is connected
// nothing needs to be done.
if isConnected[route.Node.ID] {
return nil, nil
if conn, ok := isConnected[route.Node.ID]; conn && ok {
continue nodeRouteLoop
}

// if not, we need to failover the route
failover := failoverRoute(isConnected, &route, routes)
if failover != nil {
failover.save(tx)
err := failover.save(tx)
if err != nil {
return nil, fmt.Errorf("saving failover routes: %w", err)
}

changedNodes = append(changedNodes, failover.old.Node.ID, failover.new.Node.ID)
changedNodes.Add(failover.old.Node.ID)
changedNodes.Add(failover.new.Node.ID)

continue nodeRouteLoop
}
}
}
}

chng := changedNodes.Slice()
sort.SliceStable(chng, func(i, j int) bool {
return chng[i] < chng[j]
})

if len(changedNodes) != 0 {
return &types.StateUpdate{
Type: types.StatePeerChanged,
ChangeNodes: changedNodes,
Message: "called from db.FailoverRouteIfAvailable",
ChangeNodes: chng,
Message: "called from db.FailoverNodeRoutesIfNeccessary",
}, nil
}

Expand Down
Loading

0 comments on commit 1704977

Please sign in to comment.