Skip to content

Commit

Permalink
Improve logs when there is a timeout error (#1946)
Browse files Browse the repository at this point in the history
The agent logs now print a failure message when the connection to
the controller times out. Also add a readiness probe to alert
users to the connectivity issue.

Fixes #822
  • Loading branch information
hty690 authored Mar 18, 2021
1 parent a9adf5a commit c086437
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
14 changes: 12 additions & 2 deletions pkg/agent/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"path"

Expand All @@ -28,6 +29,7 @@ import (
k8sversion "k8s.io/apimachinery/pkg/version"
"k8s.io/apiserver/pkg/registry/rest"
genericapiserver "k8s.io/apiserver/pkg/server"
"k8s.io/apiserver/pkg/server/healthz"
genericoptions "k8s.io/apiserver/pkg/server/options"

"github.com/vmware-tanzu/antrea/pkg/agent/apiserver/handlers/addressgroup"
Expand Down Expand Up @@ -93,7 +95,7 @@ func installAPIGroup(s *genericapiserver.GenericAPIServer, aq agentquerier.Agent
// New creates an APIServer for running in antrea agent.
func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier, bindPort int,
enableMetrics bool, kubeconfig string, cipherSuites []uint16, tlsMinVersion uint16) (*agentAPIServer, error) {
cfg, err := newConfig(bindPort, enableMetrics, kubeconfig)
cfg, err := newConfig(npq, bindPort, enableMetrics, kubeconfig)
if err != nil {
return nil, err
}
Expand All @@ -110,7 +112,7 @@ func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier
return &agentAPIServer{GenericAPIServer: s}, nil
}

func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
secureServing := genericoptions.NewSecureServingOptions().WithLoopback()
authentication := genericoptions.NewDelegatingAuthenticationOptions()
authorization := genericoptions.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")
Expand Down Expand Up @@ -155,6 +157,14 @@ func newConfig(bindPort int, enableMetrics bool, kubeconfig string) (*genericapi
GitCommit: antreaversion.GetGitSHA(),
}
serverConfig.EnableMetrics = enableMetrics
// Add readiness probe to check the status of watchers.
check := healthz.NamedCheck("watcher", func(_ *http.Request) error {
if npq.GetControllerConnectionStatus() {
return nil
}
return fmt.Errorf("some watchers may not be connected")
})
serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, check)

completedServerCfg := serverConfig.Complete(nil)
return &completedServerCfg, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package networkpolicy
import (
"context"
"fmt"
"reflect"
"sync"
"time"

Expand Down Expand Up @@ -44,6 +45,8 @@ const (
defaultWorkers = 4
)

// emptyWatch is a reference value obtained from watch.NewEmptyWatch(). The
// watcher compares the dynamic type of a newly returned watch against the
// type of this value to detect a watch that did not actually start.
var emptyWatch = watch.NewEmptyWatch()

// Controller is responsible for watching Antrea AddressGroups, AppliedToGroups,
// and NetworkPolicies, feeding them to ruleCache, getting dirty rules from
// ruleCache, invoking reconciler to reconcile them.
Expand Down Expand Up @@ -584,6 +587,12 @@ func (w *watcher) watch() {
klog.Warningf("Failed to start watch for %s: %v", w.objectType, err)
return
}
// For some partial data errors (e.g. a timeout), the Watch method returns an "emptyWatch"
// instead of an error. Make sure the watcher is not empty; otherwise log a warning and return.
if reflect.TypeOf(watcher) == reflect.TypeOf(emptyWatch) {
klog.Warningf("Failed to start watch for %s, please ensure antrea service is reachable for the agent", w.objectType)
return
}

klog.Infof("Started watch for %s", w.objectType)
w.setConnected(true)
Expand Down

0 comments on commit c086437

Please sign in to comment.