Skip to content

Commit

Permalink
Add more informative error messaging in operator
Browse files Browse the repository at this point in the history
Summary:
We recently added Sentry to our operator, which has resulted in some spammy errors.

When the Vizier status update fails with a timeout, it usually means the K8s API server is overloaded and cannot respond in time. These cases now log a more informative message.
We have another error when the Vizier deploy fails, but it's unclear at which stage it is failing. Updated the errors to be more fine-grained.

Test Plan: Skaffold ran operator

Reviewers: htroisi, philkuz

Reviewed By: htroisi

Signed-off-by: Michelle Nguyen <michellenguyen@pixielabs.ai>

Differential Revision: https://phab.corp.pixielabs.ai/D12534

GitOrigin-RevId: ddb199537f8ecac2f294927adce1e96a8a098853
  • Loading branch information
aimichelle authored and copybaranaut committed Nov 16, 2022
1 parent 0cf001b commit 60a463b
Showing 1 changed file with 24 additions and 5 deletions.
29 changes: 24 additions & 5 deletions src/operator/controllers/vizier_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,11 @@ func (r *VizierReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
vizier.SetStatus(status.UnableToConnectToCloud)
err := r.Status().Update(ctx, &vizier)
if err != nil {
log.WithError(err).Error("Failed to update vizier status")
if strings.Contains(err.Error(), "timeout") {
log.WithError(err).Info("Timed out trying to update vizier status. K8s API server may be overloaded")
} else {
log.WithError(err).Error("Failed to update vizier status")
}
}
log.WithError(err).Error("Failed to connect to Pixie cloud")
return ctrl.Result{}, err
Expand All @@ -223,7 +227,11 @@ func (r *VizierReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
vizier.Status.OperatorVersion = version.GetVersion().ToString()
err = r.Status().Update(ctx, &vizier)
if err != nil {
log.WithError(err).Error("Failed to update vizier status")
if strings.Contains(err.Error(), "timeout") {
log.WithError(err).Info("Timed out trying to update vizier status. K8s API server may be overloaded")
} else {
log.WithError(err).Error("Failed to update vizier status")
}
}
}

Expand Down Expand Up @@ -283,7 +291,11 @@ func (r *VizierReconciler) createVizier(ctx context.Context, req ctrl.Request, v
vz.SetStatus(status.UnableToConnectToCloud)
err := r.Status().Update(ctx, vz)
if err != nil {
log.WithError(err).Error("Failed to update vizier status")
if strings.Contains(err.Error(), "timeout") {
log.WithError(err).Info("Timed out trying to update vizier status. K8s API server may be overloaded")
} else {
log.WithError(err).Error("Failed to update vizier status")
}
}
log.WithError(err).Error("Failed to connect to Pixie cloud")
return err
Expand Down Expand Up @@ -317,7 +329,11 @@ func (r *VizierReconciler) deployVizier(ctx context.Context, req ctrl.Request, v
vz.SetStatus(status.UnableToConnectToCloud)
err := r.Status().Update(ctx, vz)
if err != nil {
log.WithError(err).Error("Failed to update vizier status")
if strings.Contains(err.Error(), "timeout") {
log.WithError(err).Info("Timed out trying to update vizier status. K8s API server may be overloaded")
} else {
log.WithError(err).Error("Failed to update vizier status")
}
}
log.WithError(err).Error("Failed to connect to Pixie cloud")
return err
Expand Down Expand Up @@ -416,7 +432,7 @@ func (r *VizierReconciler) deployVizier(ctx context.Context, req ctrl.Request, v

err = r.deployVizierCore(ctx, req.Namespace, vz, yamlMap, update)
if err != nil {
log.WithError(err).Error("Failed to deploy Vizier core")
log.WithError(err).Info("Failed to deploy Vizier core")
return err
}

Expand Down Expand Up @@ -621,6 +637,7 @@ func (r *VizierReconciler) deployVizierCore(ctx context.Context, namespace strin

resources, err := k8s.GetResourcesFromYAML(strings.NewReader(yamlMap[vzYaml]))
if err != nil {
log.WithError(err).Error("Error getting resources from Vizier YAML")
return err
}

Expand All @@ -638,11 +655,13 @@ func (r *VizierReconciler) deployVizierCore(ctx context.Context, namespace strin
for _, r := range resources {
err = updateResourceConfiguration(r, vz)
if err != nil {
log.WithError(err).Error("Failed to update resource configuration for resources")
return err
}
}
err = retryDeploy(r.Clientset, r.RestConfig, namespace, resources, allowUpdate)
if err != nil {
log.WithError(err).Error("Retry deploy of Vizier failed")
return err
}

Expand Down

0 comments on commit 60a463b

Please sign in to comment.