Skip to content

Commit

Permalink
[node-agent] Do not try fetching node name on start-up (gardener#8826)
Browse files Browse the repository at this point in the history
* Do not try fetching node name on start-up

For newly created VMs, the node name is not known yet since the kubelet has to be installed first. Since GNA is the component that installs the kubelet, it doesn't make sense to try to fetch the node name on start-up.
Instead, the OSC controller tries fetching it as part of its reconciliation flow.

* Fix event recording

In Go, `nil` is not always `nil` - there are many topics and even official FAQ statements on this topic.
In our case, `recordEvent` takes a `runtime.Object` while the `executeUnitCommands` function (https://github.com/gardener/gardener/blob/eba989bc166232707bb1309c7df61cc6fa1ee8ca/pkg/nodeagent/controller/operatingsystemconfig/reconciler.go#L343) takes the `node` argument with type `client.Object`. When the node is not known yet (e.g., newly created VM), the `node !=nil` check in `recordEvent` results to `true` even though the value type is `nil` - but the interface type is not `nil`. This is due to the "conversion" on the way from `*corev1.Node` to `client.Object` and `runtime.Object`, finally.

Co-Authored-By: Johannes Scheerer <johannes.scheerer@sap.com>

---------

Co-authored-by: Johannes Scheerer <johannes.scheerer@sap.com>
  • Loading branch information
rfranzke and ScheererJ authored Nov 17, 2023
1 parent 9ba2ba0 commit 72ce855
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 60 deletions.
52 changes: 9 additions & 43 deletions cmd/gardener-node-agent/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import (
"github.com/spf13/cobra"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/rest"
"k8s.io/component-base/version/verflag"
"k8s.io/utils/pointer"
Expand Down Expand Up @@ -146,10 +146,10 @@ func run(ctx context.Context, cancel context.CancelFunc, log logr.Logger, cfg *c
}
}

log.Info("Fetching node name based on hostname")
nodeName, err := getNodeName(ctx, log, restConfig)
log.Info("Fetching hostname")
hostName, err := os.Hostname()
if err != nil {
return err
return fmt.Errorf("failed fetching hostname: %w", err)
}

log.Info("Setting up manager")
Expand All @@ -164,16 +164,10 @@ func run(ctx context.Context, cancel context.CancelFunc, log logr.Logger, cfg *c
ExtraHandlers: extraHandlers,
},

Cache: cache.Options{
ByObject: map[client.Object]cache.ByObject{
&corev1.Secret{}: {
Namespaces: map[string]cache.Config{metav1.NamespaceSystem: {}},
},
&corev1.Node{}: {
Field: fields.SelectorFromSet(fields.Set{metav1.ObjectNameField: nodeName}),
},
},
},
Cache: cache.Options{ByObject: map[client.Object]cache.ByObject{
&corev1.Secret{}: {Namespaces: map[string]cache.Config{metav1.NamespaceSystem: {}}},
&corev1.Node{}: {Label: labels.SelectorFromSet(labels.Set{corev1.LabelHostname: hostName})},
}},
LeaderElection: false,
Controller: controllerconfig.Controller{
RecoverPanic: pointer.Bool(true),
Expand All @@ -192,7 +186,7 @@ func run(ctx context.Context, cancel context.CancelFunc, log logr.Logger, cfg *c
}

log.Info("Adding controllers to manager")
if err := controller.AddToManager(cancel, mgr, cfg, nodeName); err != nil {
if err := controller.AddToManager(cancel, mgr, cfg, hostName); err != nil {
return fmt.Errorf("failed adding controllers to manager: %w", err)
}

Expand Down Expand Up @@ -260,31 +254,3 @@ func fetchAccessToken(ctx context.Context, log logr.Logger, restConfig *rest.Con
restConfig.BearerTokenFile = nodeagentv1alpha1.TokenFilePath
return nil
}

func getNodeName(ctx context.Context, log logr.Logger, restConfig *rest.Config) (string, error) {
hostname, err := os.Hostname()
if err != nil {
return "", fmt.Errorf("failed fetching hostname: %w", err)
}

cl, err := client.New(restConfig, client.Options{})
if err != nil {
return "", fmt.Errorf("unable to create client: %w", err)
}

nodeList := &metav1.PartialObjectMetadataList{}
nodeList.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("NodeList"))
if err := cl.List(ctx, nodeList, client.MatchingLabels{corev1.LabelHostname: hostname}); err != nil {
return "", err
}

switch len(nodeList.Items) {
case 0:
return "", fmt.Errorf("could not find any node with label %s=%s", corev1.LabelHostname, hostname)
case 1:
log.Info("Found node name based on hostname", "hostname", hostname, "nodeName", nodeList.Items[0].Name)
return nodeList.Items[0].Name, nil
default:
return "", fmt.Errorf("found more than one node with label %s=%s", corev1.LabelHostname, hostname)
}
}
4 changes: 2 additions & 2 deletions pkg/nodeagent/controller/add.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ import (
)

// AddToManager adds all controllers to the given manager.
func AddToManager(cancel context.CancelFunc, mgr manager.Manager, cfg *config.NodeAgentConfiguration, nodeName string) error {
func AddToManager(cancel context.CancelFunc, mgr manager.Manager, cfg *config.NodeAgentConfiguration, hostName string) error {
if err := (&node.Reconciler{}).AddToManager(mgr); err != nil {
return fmt.Errorf("failed adding node controller: %w", err)
}

if err := (&operatingsystemconfig.Reconciler{
Config: cfg.Controllers.OperatingSystemConfig,
NodeName: nodeName,
HostName: hostName,
CancelContext: cancel,
}).AddToManager(mgr); err != nil {
return fmt.Errorf("failed adding operating system config controller: %w", err)
Expand Down
45 changes: 35 additions & 10 deletions pkg/nodeagent/controller/operatingsystemconfig/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,10 @@ type Reconciler struct {
Recorder record.EventRecorder
DBus dbus.DBus
FS afero.Afero
NodeName string
Extractor registry.Extractor
CancelContext context.CancelFunc
HostName string
nodeName string
}

// Reconcile decodes the OperatingSystemConfig resources from secrets and applies the systemd units and files to the
Expand All @@ -84,23 +85,19 @@ func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
return reconcile.Result{}, fmt.Errorf("error retrieving object from store: %w", err)
}

node := &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Name: r.NodeName}}
node.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("Node"))
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(node), node); err != nil {
if !apierrors.IsNotFound(err) {
return reconcile.Result{}, fmt.Errorf("unable to fetch node %q: %w", r.NodeName, err)
}
node = nil
node, err := r.getNode(ctx)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed getting node: %w", err)
}

osc, oscRaw, oscChecksum, err := extractOSCFromSecret(secret)
if err != nil {
return reconcile.Result{}, err
return reconcile.Result{}, fmt.Errorf("failed extracting OSC from secret: %w", err)
}

oscChanges, err := computeOperatingSystemConfigChanges(r.FS, osc)
if err != nil {
return reconcile.Result{}, fmt.Errorf("unable to calculate the OSC changes: %w", err)
return reconcile.Result{}, fmt.Errorf("failed calculating the OSC changes: %w", err)
}

if node != nil && node.Annotations[executor.AnnotationKeyChecksum] == oscChecksum {
Expand Down Expand Up @@ -170,6 +167,34 @@ func (r *Reconciler) Reconcile(ctx context.Context, request reconcile.Request) (
return reconcile.Result{RequeueAfter: r.Config.SyncPeriod.Duration}, r.Client.Patch(ctx, node, patch)
}

func (r *Reconciler) getNode(ctx context.Context) (*metav1.PartialObjectMetadata, error) {
if r.nodeName != "" {
node := &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Name: r.nodeName}}
node.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("Node"))
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(node), node); err != nil {
return nil, fmt.Errorf("unable to fetch node %q: %w", r.nodeName, err)
}
return node, nil
}

// node name not known yet, try to fetch it via label selector based on hostname
nodeList := &metav1.PartialObjectMetadataList{}
nodeList.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("NodeList"))
if err := r.Client.List(ctx, nodeList, client.MatchingLabels{corev1.LabelHostname: r.HostName}); err != nil {
return nil, fmt.Errorf("unable to list nodes with label selector %s=%s: %w", corev1.LabelHostname, r.HostName, err)
}

switch len(nodeList.Items) {
case 0:
return nil, nil
case 1:
r.nodeName = nodeList.Items[0].Name
return &nodeList.Items[0], nil
default:
return nil, fmt.Errorf("found more than one node with label %s=%s", corev1.LabelHostname, r.HostName)
}
}

var (
etcSystemdSystem = path.Join("/", "etc", "systemd", "system")
defaultFilePermissions os.FileMode = 0600
Expand Down
3 changes: 2 additions & 1 deletion pkg/nodeagent/dbus/dbus.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package dbus
import (
"context"
"fmt"
"reflect"

"github.com/coreos/go-systemd/v22/dbus"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -151,7 +152,7 @@ func (_ *db) DaemonReload(ctx context.Context) error {
}

func recordEvent(recorder record.EventRecorder, node runtime.Object, err error, unitName, reason, operation string) {
if recorder != nil && node != nil {
if recorder != nil && node != nil && !reflect.ValueOf(node).IsNil() { // nil is not nil :(
var (
eventType = corev1.EventTypeNormal
message = fmt.Sprintf("processed %s of unit %s", operation, unitName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ var _ = Describe("OperatingSystemConfig controller tests", func() {
oscSecretName = testRunID
kubernetesVersion = semver.MustParse("1.2.3")

node *corev1.Node
hostName = "test-hostname"
node *corev1.Node

file1, file2, file3, file4, file5 extensionsv1alpha1.File
gnaUnit, unit1, unit2, unit3, unit4, unit5, unit5DropInsOnly, unit6, unit7 extensionsv1alpha1.Unit
Expand Down Expand Up @@ -86,8 +87,11 @@ var _ = Describe("OperatingSystemConfig controller tests", func() {

node = &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: testRunID,
Labels: map[string]string{testID: testRunID},
Name: testRunID,
Labels: map[string]string{
testID: testRunID,
"kubernetes.io/hostname": hostName,
},
},
}

Expand All @@ -107,7 +111,7 @@ var _ = Describe("OperatingSystemConfig controller tests", func() {
},
DBus: fakeDBus,
FS: fakeFS,
NodeName: node.Name,
HostName: hostName,
Extractor: fakeregistry.NewExtractor(fakeFS, imageMountDirectory),
CancelContext: cancelFunc.cancel,
}).AddToManager(mgr)).To(Succeed())
Expand Down

0 comments on commit 72ce855

Please sign in to comment.