Skip to content

Commit

Permalink
Fix nil-pointer panics from proxycfg package.
Browse files Browse the repository at this point in the history
Prior to this PR, servers / agents would panic and crash if an ingress
or api gateway were configured to use a discovery chain that both:

1. Referenced a peered service
2. Had a mesh gateway mode of local

This could occur, because code for handling upstream watches was shared
between both connect-proxy and the gateways. As a short-term fix, this
PR ensures that the maps are always initialized for these gateway services.

For agentless deployments, this behavior is particularly bad. The server
would essentially get stuck in an endless crash-loop, because it would
attempt to execute proxycfg immediately from its persisted state. To help
prevent this, the PR also wraps the proxycfg execution and service
registration calls with recover statements to ensure that future issues
like this do not put the server into an unrecoverable state.
  • Loading branch information
hashi-derek committed Feb 15, 2023
1 parent fd61605 commit e5adc43
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 0 deletions.
1 change: 1 addition & 0 deletions agent/proxycfg/api_gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ func (h *handlerAPIGateway) initialize(ctx context.Context) (ConfigSnapshot, err
snap.APIGateway.WatchedDiscoveryChains = make(map[UpstreamID]context.CancelFunc)
snap.APIGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.APIGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.APIGateway.WatchedUpstreams = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)

Expand Down
1 change: 1 addition & 0 deletions agent/proxycfg/ingress_gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ func (s *handlerIngressGateway) initialize(ctx context.Context) (ConfigSnapshot,
snap.IngressGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.IngressGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.IngressGateway.Listeners = make(map[IngressListenerKey]structs.IngressListener)
snap.IngressGateway.UpstreamPeerTrustBundles = watch.NewMap[string, *pbpeering.PeeringTrustBundle]()
snap.IngressGateway.PeerUpstreamEndpoints = watch.NewMap[UpstreamID, structs.CheckServiceNodes]()
Expand Down
15 changes: 15 additions & 0 deletions agent/proxycfg/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package proxycfg

import (
"errors"
"runtime/debug"
"sync"

"github.com/hashicorp/go-hclog"
Expand Down Expand Up @@ -142,6 +143,20 @@ func (m *Manager) Register(id ProxyID, ns *structs.NodeService, source ProxySour
m.mu.Lock()
defer m.mu.Unlock()

defer func() {
if r := recover(); r != nil {
m.Logger.Error("unexpected panic during service manager registration",
"node", id.NodeName,
"service", id.ServiceID,
"message", r,
"stacktrace", string(debug.Stack()),
)
}
}()
return m.register(id, ns, source, token, overwrite)
}

func (m *Manager) register(id ProxyID, ns *structs.NodeService, source ProxySource, token string, overwrite bool) error {
state, ok := m.proxies[id]
if ok {
if state.source != source && !overwrite {
Expand Down
16 changes: 16 additions & 0 deletions agent/proxycfg/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"net"
"reflect"
"runtime/debug"
"sync/atomic"
"time"

Expand Down Expand Up @@ -298,6 +299,21 @@ func newConfigSnapshotFromServiceInstance(s serviceInstance, config stateConfig)
}

func (s *state) run(ctx context.Context, snap *ConfigSnapshot) {
// Add a recover here so than any panics do not make their way up
// into the server / agent.
defer func() {
if r := recover(); r != nil {
s.logger.Error("unexpected panic while running proxycfg",
"node", s.serviceInstance.proxyID.NodeName,
"service", s.serviceInstance.proxyID.ServiceID,
"message", r,
"stacktrace", string(debug.Stack()))
}
}()
s.unsafeRun(ctx, snap)
}

func (s *state) unsafeRun(ctx context.Context, snap *ConfigSnapshot) {
// Close the channel we return from Watch when we stop so consumers can stop
// watching and clean up their goroutines. It's important we do this here and
// not in Close since this routine sends on this chan and so might panic if it
Expand Down

0 comments on commit e5adc43

Please sign in to comment.