Fix bug: we should start conversion webhook before initiating Longhorn datastore

longhorn-8578

Signed-off-by: Phan Le <phan.le@suse.com>
PhanLe1010 authored and innobead committed May 22, 2024
1 parent ebace7b commit 1d1c828
Showing 2 changed files with 36 additions and 26 deletions.
app/daemon.go: 20 additions, 12 deletions
@@ -148,22 +148,30 @@ func startManager(c *cli.Context) error {

logger := logrus.StandardLogger().WithField("node", currentNodeID)

clients, err := client.NewClients(kubeconfigPath, ctx.Done())
// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
// Conversion webhook does not need the Longhorn datastore.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
return err
}

// Conversion webhook needs to be started first since we use its port 9501 as readiness port.
// longhorn-manager pod becomes ready only when conversion webhook is running.
// The services in the longhorn-manager can then start to receive the requests.
webhookTypes := []string{types.WebhookTypeConversion, types.WebhookTypeAdmission}
for _, webhookType := range webhookTypes {
if err := webhook.StartWebhook(ctx, webhookType, clients); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(webhookType); err != nil {
return err
}
clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeAdmission); err != nil {
return err
}

if err := recoverybackend.StartRecoveryBackend(clients); err != nil {
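Read back-to-back, the added lines above amount to the following startup order in startManager. This is a condensed sketch of the new flow, not the verbatim file; every identifier in it comes from the diff itself:

// Start the conversion webhook first, using clients built without the Longhorn datastore,
// because its port 9501 serves as the longhorn-manager pod's readiness port.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, ctx.Done())
if err != nil {
	return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeConversion, clientsWithoutDatastore); err != nil {
	return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeConversion); err != nil {
	return err
}

// Only then build the full clients (datastore included) and start the admission webhook.
clients, err := client.NewClients(kubeconfigPath, true, ctx.Done())
if err != nil {
	return err
}
if err := webhook.StartWebhook(ctx, types.WebhookTypeAdmission, clients); err != nil {
	return err
}
if err := webhook.CheckWebhookServiceAvailability(types.WebhookTypeAdmission); err != nil {
	return err
}
// ... followed by recoverybackend.StartRecoveryBackend(clients) and the rest of startManager.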
util/client/client.go: 16 additions, 14 deletions
@@ -9,7 +9,6 @@ import (
wranglerClients "github.com/rancher/wrangler/pkg/clients"
wranglerSchemes "github.com/rancher/wrangler/pkg/schemes"
"github.com/sirupsen/logrus"

"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/clientcmd"

@@ -34,7 +33,7 @@ type Clients struct {
StopCh <-chan struct{}
}

func NewClients(kubeconfigPath string, stopCh <-chan struct{}) (*Clients, error) {
func NewClients(kubeconfigPath string, needDataStore bool, stopCh <-chan struct{}) (*Clients, error) {
namespace := os.Getenv(types.EnvPodNamespace)
if namespace == "" {
logrus.Warnf("Cannot detect pod namespace, environment variable %v is missing, using default namespace", types.EnvPodNamespace)
@@ -82,18 +81,21 @@ func NewClients(kubeconfigPath string, stopCh <-chan struct{}) (*Clients, error)
return nil, errors.Wrap(err, "unable to get metrics client")
}

// TODO: there shouldn't be a need for a 30s resync period unless our code is buggy and our controllers aren't really
// level based. What we are effectively doing with this is hiding faulty logic in production.
// Another reason for increasing this substantially, is that it introduces a lot of unnecessary work and will
// lead to scalability problems, since we dump the whole cache of each object back in to the reconciler every 30 seconds.
// if a specific controller requires a periodic resync, one enable it only for that informer, add a resync to the event handler, go routine, etc.
// some refs to look at: https://github.com/kubernetes-sigs/controller-runtime/issues/521
informerFactories := util.NewInformerFactories(namespace, clients.K8s, lhClient, 30*time.Second)
ds := datastore.NewDataStore(namespace, lhClient, clients.K8s, extensionsClient, informerFactories)

informerFactories.Start(stopCh)
if !ds.Sync(stopCh) {
return nil, fmt.Errorf("datastore cache sync up failed")
var ds *datastore.DataStore
if needDataStore {
// TODO: there shouldn't be a need for a 30s resync period unless our code is buggy and our controllers aren't really
// level based. What we are effectively doing with this is hiding faulty logic in production.
// Another reason for increasing this substantially, is that it introduces a lot of unnecessary work and will
// lead to scalability problems, since we dump the whole cache of each object back in to the reconciler every 30 seconds.
// if a specific controller requires a periodic resync, one can enable it only for that informer, add a resync to the event handler, go routine, etc.
// some refs to look at: https://github.com/kubernetes-sigs/controller-runtime/issues/521
informerFactories := util.NewInformerFactories(namespace, clients.K8s, lhClient, 30*time.Second)
ds = datastore.NewDataStore(namespace, lhClient, clients.K8s, extensionsClient, informerFactories)

informerFactories.Start(stopCh)
if !ds.Sync(stopCh) {
return nil, fmt.Errorf("datastore cache sync up failed")
}
}

return &Clients{
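With the new needDataStore flag, callers decide whether the datastore, and the informer cache sync it requires, is built at all; when the flag is false, ds is never assigned, so the datastore in the returned Clients stays nil. A minimal usage sketch, assuming the caller already has a kubeconfig path and a stop channel:

// Conversion webhook path: no informer factories are built and no cache sync is awaited,
// so the datastore in the returned Clients stays nil.
clientsWithoutDatastore, err := client.NewClients(kubeconfigPath, false, stopCh)
if err != nil {
	return err
}

// Regular path: the datastore is constructed and NewClients returns only after the
// informer caches have synced; otherwise it fails with "datastore cache sync up failed".
clients, err := client.NewClients(kubeconfigPath, true, stopCh)
if err != nil {
	return err
}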
