Skip to content

Commit

Permalink
ipam: Warn when failing to update CN status
Browse files Browse the repository at this point in the history
This commit attempts to improve the readability and debugability of
(*Node).syncToAPIServer(). The main motivation for this change is to fix
the swallowing of the error when UpdateStatus fails, and the subsequent
Get succeeds.

To that end, this commit adds a warning log msg and propagates the error
in case we never succeed, to surface any trouble with updating the
status of the CiliumNode (CN) resource. Failures in updating the status
can be indicative of general connectivity problems in ENI and Azure IPAM
modes. As seen in cilium#13193, it can
also lead to Cilium never becoming ready because the agent was never
able to retrieve ENI information required to make allocations, etc. (The
former was caused by the Operator being blocked from "publishing" the
ENI information into the CN resource, which is why the agent was unable
to retrieve it.)

Signed-off-by: Chris Tarazi <chris@isovalent.com>
  • Loading branch information
christarazi authored and vadorovsky committed Sep 17, 2020
1 parent 407c415 commit a7451f8
Showing 1 changed file with 17 additions and 7 deletions.
24 changes: 17 additions & 7 deletions pkg/ipam/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
ipamTypes "github.com/cilium/cilium/pkg/ipam/types"
v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
"github.com/cilium/cilium/pkg/lock"
"github.com/cilium/cilium/pkg/logging/logfields"
"github.com/cilium/cilium/pkg/math"
"github.com/cilium/cilium/pkg/trigger"

Expand Down Expand Up @@ -686,10 +687,8 @@ func (n *Node) MaintainIPPool(ctx context.Context) error {
}

// syncToAPIServer is called to synchronize the node content with the custom
// resource in the apiserver
// resource in the apiserver.
func (n *Node) syncToAPIServer() (err error) {
var updatedNode *v2.CiliumNode

scopedLog := n.logger()
scopedLog.Debug("Refreshing node")

Expand All @@ -707,26 +706,37 @@ func (n *Node) syncToAPIServer() (err error) {
// Two attempts are made in case the local resource is outdated. If the
// second attempt fails as well we are likely under heavy contention,
// fall back to the controller based background interval to retry.
var updatedNode *v2.CiliumNode
for retry := 0; retry < 2; retry++ {
if node.Status.IPAM.Used == nil {
node.Status.IPAM.Used = ipamTypes.AllocationMap{}
}

var updateErr error
n.ops.PopulateStatusFields(node)
updatedNode, err = n.manager.k8sAPI.UpdateStatus(origNode, node)
updatedNode, updateErr = n.manager.k8sAPI.UpdateStatus(origNode, node)
if updatedNode != nil && updatedNode.Name != "" {
node = updatedNode.DeepCopy()
if err == nil {
if updateErr == nil {
break
}
} else if err != nil {
} else if updateErr != nil {
scopedLog.WithError(updateErr).WithFields(logrus.Fields{
logfields.Attempt: retry,
}).Warning("Failed to update CiliumNode status")

node, err = n.manager.k8sAPI.Get(node.Name)
if err != nil {
break
} else {
// Propagate the error in case we exit the loop without
// succeeding in updating the status.
err = updateErr
}
node = node.DeepCopy()
origNode = node.DeepCopy()
} else {
} else /* updateErr == nil */ {
err = updateErr
break
}
}
Expand Down

0 comments on commit a7451f8

Please sign in to comment.