Fix bug: we should start conversion webhook before initiating Longhorn datastore

longhorn-8578

Signed-off-by: Phan Le <phan.le@suse.com>
PhanLe1010 authored and innobead committed May 22, 2024
1 parent ebace7b commit 1d1c828
Showing 2 changed files with 36 additions and 26 deletions.
app/daemon.go: 20 additions, 12 deletions
@@ -148,22 +148,30 @@ func startManager(c *cli.Context) error {

logger := logrus.StandardLogger().WithField("node", currentNodeID)

clients, err := client.NewClients(kubeconfigPath, ctx.Done())
// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
// Conversion webhook does not need the Longhorn datastore.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
return err
}

// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
webhookTypes := []string{types.WebhookTypeConversion, types.WebhookTypeAdmission}
for _, webhookType := range webhookTypes {
if err := webhook.StartWebhook(ctx, webhookType, clients); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(webhookType); err != nil {
return err
}
clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeAdmission); err != nil {
return err
}

if err := recoverybackend.StartRecoveryBackend(clients); err != nil {
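Read back-to-back, the added lines above amount to the following startup order in startManager. This is a condensed sketch of the new flow, not the verbatim file; every identifier in it comes from the diff itself:

// Start the conversion webhook first, using clients built without the Longhorn datastore,
// because its port 9501 serves as the longhorn-manager pod's readiness port.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
if err != nil {
	return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
	return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
	return err
}

// Only then build the full clients (datastore included) and start the admission webhook.
clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
	return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
	return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeAdmission); err != nil {
	return err
}
// ... followed by recoverybackend.StartRecoveryBackend(clients) and the rest of startManager.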
util/client/client.go: 16 additions, 14 deletions
@@ -9,7 +9,6 @@ import (
wranglerClients "github.com/rancher/wrangler/pkg/clients"
wranglerSchemes "github.com/rancher/wrangler/pkg/schemes"
"github.com/sirupsen/logrus"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/clientcmd"

@@ -34,7 +33,7 @@ type Clients struct {
StopCh <-chan struct{}
}

func NewClients(kubeconfigPath string, stopCh <-chan struct{}) (*Clients, error) {
func NewClients(kubeconfigPath string, needDataStore bool, stopCh <-chan struct{}) (*Clients, error) {
namespace := os.Getenv(types.EnvPodNamespace)
if namespace == "" {
logrus.Warnf("Cannot detect pod namespace, environment variable %v is missing, using default namespace", types.EnvPodNamespace)
@@ -82,18 +81,21 @@ func NewClients(kubeconfigPath string, stopCh <-chan struct{}) (*Clients, error)
return nil, errors.Wrap(err, "unable to get metrics client")
}

// TODO: there shouldn't be a need for a 30s resync period unless our code is buggy and our controllers aren't really
// level based. What we are effectively doing with this is hiding faulty logic in production.
// Another reason for increasing this substantially, is that it introduces a lot of unnecessary work and will
// lead to scalability problems, since we dump the whole cache of each object back in to the reconciler every 30 seconds.
// if a specific controller requires a periodic resync, one enable it only for that informer, add a resync to the event handler, go routine, etc.
// some refs to look at: https://github.com/kubernetes-sigs/controller-runtime/issues/521
informerFactories := util.NewInformerFactories(namespace, clients.K8s, lhClient, 30*time.Second)
ds := datastore.NewDataStore(namespace, lhClient, clients.K8s, extensionsClient, informerFactories)

informerFactories.Start(stopCh)
if !ds.Sync(stopCh) {
return nil, fmt.Errorf("datastore cache sync up failed")
var ds *datastore.DataStore
if needDataStore {
// TODO: there shouldn't be a need for a 30s resync period unless our code is buggy and our controllers aren't really
// level based. What we are effectively doing with this is hiding faulty logic in production.
// Another reason for increasing this substantially, is that it introduces a lot of unnecessary work and will
// lead to scalability problems, since we dump the whole cache of each object back in to the reconciler every 30 seconds.
// if a specific controller requires a periodic resync, one can enable it only for that informer, add a resync to the event handler, go routine, etc.
// some refs to look at: https://github.com/kubernetes-sigs/controller-runtime/issues/521
informerFactories := util.NewInformerFactories(namespace, clients.K8s, lhClient, 30*time.Second)
ds = datastore.NewDataStore(namespace, lhClient, clients.K8s, extensionsClient, informerFactories)

informerFactories.Start(stopCh)
if !ds.Sync(stopCh) {
return nil, fmt.Errorf("datastore cache sync up failed")
}
}

return &Clients{
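With the new needDataStore flag, callers decide whether the datastore, and the informer cache sync it requires, is built at all; when the flag is false, ds is never assigned, so the datastore in the returned Clients stays nil. A minimal usage sketch, assuming the caller already has a kubeconfig path and a stop channel:

// Conversion webhook path: no informer factories are built and no cache sync is awaited,
// so the datastore in the returned Clients stays nil.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, stopCh)
if err != nil {
	return err
}

// Regular path: the datastore is constructed and NewClients returns only after the
// informer caches have synced; otherwise it fails with "datastore cache sync up failed".
clients, err := client.NewClients(kubeconfigPath, true, stopCh)
if err != nil {
	return err
}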
