Skip to content

Commit

Permalink
[pixie-ioGH-503] Operator should auto-renew certs
Browse files Browse the repository at this point in the history
Summary:
Currently users can get into a bad state where their TLS certs expire, and they need to redeploy Pixie in order to get things running again.
Instead, the operator should auto-detect when these certs are about to expire and do the renewal on behalf of the user.

Fixes pixie-io#503

Test Plan: Deploy operator and test with various certs

Reviewers: vihang, philkuz, nserrino

Reviewed By: philkuz, nserrino

Subscribers: nserrino, philkuz, vihang

Signed-off-by: Michelle Nguyen <michellenguyen@pixielabs.ai>

Differential Revision: https://phab.corp.pixielabs.ai/D12530

GitOrigin-RevId: 57d0ed362ab74d34a0863c7e4cb73c8f55ccd0c2
  • Loading branch information
aimichelle authored and copybaranaut committed Nov 15, 2022
1 parent b41be8e commit 8fbd3bb
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 6 deletions.
74 changes: 74 additions & 0 deletions src/operator/controllers/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ package controllers
import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/pem"
"fmt"
"io"
"net"
Expand All @@ -39,12 +41,14 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"sigs.k8s.io/controller-runtime/pkg/client"

"px.dev/pixie/src/api/proto/cloudpb"
pixiev1alpha1 "px.dev/pixie/src/operator/apis/px.dev/v1alpha1"
"px.dev/pixie/src/shared/status"
"px.dev/pixie/src/utils/shared/k8s"
)

const (
Expand Down Expand Up @@ -105,6 +109,7 @@ func (c *concurrentPodMap) write(nameLabel, k8sName string, p *podWrapper) {
// for the overall Vizier instance.
type VizierMonitor struct {
clientset kubernetes.Interface
restConfig *rest.Config
factory informers.SharedInformerFactory
httpClient HTTPClient
ctx context.Context
Expand All @@ -118,6 +123,7 @@ type VizierMonitor struct {
podStates *concurrentPodMap
nodeState *vizierState
pvcState *vizierState
certState *vizierState

vzUpdate func(context.Context, client.Object, ...client.UpdateOption) error
vzGet func(context.Context, types.NamespacedName, client.Object) error
Expand All @@ -136,12 +142,15 @@ func (m *VizierMonitor) InitAndStartMonitor(cloudClient *grpc.ClientConn) {

m.nodeState = okState()
m.pvcState = okState()
m.certState = okState()

m.factory = informers.NewSharedInformerFactoryWithOptions(m.clientset, 0, informers.WithNamespace(m.namespace))

// Watch for pod updates in the namespace.
go m.watchK8sPods()

m.watchCerts()

// Start PVC monitor.
pvcStateCh := make(chan *vizierState)
pvcW := &pvcWatcher{
Expand Down Expand Up @@ -211,6 +220,48 @@ func (m *VizierMonitor) watchK8sPods() {
informer.Run(stopper)
}

// watchCerts runs one certificate check immediately and then starts a
// background goroutine that re-runs the check every 24 hours until m.ctx is
// cancelled. Check failures are logged and do not stop the watcher.
func (m *VizierMonitor) watchCerts() {
	if err := m.checkCerts(); err != nil {
		log.WithError(err).Error("Failed to check certs")
	}

	ticker := time.NewTicker(24 * time.Hour)
	go func() {
		// Release the ticker's resources once the watcher exits; a ticker
		// that is never stopped keeps running after the goroutine returns.
		defer ticker.Stop()

		for {
			select {
			case <-m.ctx.Done():
				log.Info("Received cancel, stopping cert checker")
				return
			case <-ticker.C:
				if err := m.checkCerts(); err != nil {
					log.WithError(err).Error("Failed to check certs")
				}
			}
		}
	}()
}

// checkCerts reads the "service-tls-certs" secret in the monitor's namespace,
// parses the PEM-encoded certificate stored under "server.crt", and updates
// m.certState: TLSCertsExpired when the certificate's NotAfter falls within
// the next five days (or has already passed), and the OK state otherwise.
//
// It returns an error and leaves m.certState untouched when the secret cannot
// be fetched, when "server.crt" holds no PEM data, or when the certificate
// cannot be parsed.
//
// NOTE(review): m.certState is assigned here and is also read/assigned by
// other methods of VizierMonitor; confirm whether those run concurrently with
// the cert checker and need synchronization.
func (m *VizierMonitor) checkCerts() error {
	// renewalWindow is how far ahead of expiry the cert is flagged for renewal.
	const renewalWindow = 5 * 24 * time.Hour

	tlsSecret, err := m.clientset.CoreV1().Secrets(m.namespace).Get(context.Background(), "service-tls-certs", metav1.GetOptions{})
	if err != nil {
		return err
	}

	// pem.Decode returns a nil block when the input contains no PEM data
	// (which includes a missing or empty "server.crt" key). Guard against it
	// instead of dereferencing a nil pointer.
	block, _ := pem.Decode(tlsSecret.Data["server.crt"])
	if block == nil {
		return fmt.Errorf("no PEM data found under %q in secret %q", "server.crt", "service-tls-certs")
	}

	x509cert, err := x509.ParseCertificate(block.Bytes)
	if err != nil {
		log.WithError(err).Error("failed to parse cert")
		return err
	}

	if time.Now().Add(renewalWindow).After(x509cert.NotAfter) {
		m.certState = &vizierState{Reason: status.TLSCertsExpired}
		return nil
	}
	m.certState = okState()
	return nil
}

// vizierState details the state of Vizier at a snapshot.
type vizierState struct {
// Reason is the description of the state. Should only be set with values enumerated in `src/shared/status/vzstatus.go`
Expand Down Expand Up @@ -450,6 +501,10 @@ func (m *VizierMonitor) getVizierState(vz *pixiev1alpha1.Vizier) *vizierState {
return vzVersionState
}

if !isOk(m.certState) {
return m.certState
}

if !vz.Spec.UseEtcdOperator && !isOk(m.pvcState) {
return m.pvcState
}
Expand Down Expand Up @@ -540,6 +595,25 @@ func (m *VizierMonitor) repairVizier(state *vizierState) error {
}

log.Info("Successfully switched to etcd backed metadata store")
} else if state.Reason == status.TLSCertsExpired {
vz := &pixiev1alpha1.Vizier{}
err := m.vzGet(context.Background(), m.namespacedName, vz)
if err != nil {
log.WithError(err).Error("Failed to get vizier")
return err
}

err = deployCerts(context.Background(), m.namespace, vz, m.clientset, m.restConfig, true)
if err != nil {
log.WithError(err).Error("Failed to update certs")
}
m.certState = okState()

log.Info("Bouncing Vizier pods to get certs update")
err = k8s.DeletePods(m.clientset, m.namespace, "")
if err != nil {
return err
}
}

return nil
Expand Down
15 changes: 9 additions & 6 deletions src/operator/controllers/vizier_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ func (r *VizierReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
vzGet: r.Get,
clientset: r.Clientset,
vzSpecUpdate: r.Update,
restConfig: r.RestConfig,
}

cloudClient, err := getCloudClientConnection(vizier.Spec.CloudAddr, vizier.Spec.DevCloudNamespace, grpc.FailOnNonTempDialError(true), grpc.WithBlock())
Expand Down Expand Up @@ -233,6 +234,7 @@ func (r *VizierReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// updateVizier updates the vizier instance according to the spec.
func (r *VizierReconciler) updateVizier(ctx context.Context, req ctrl.Request, vz *v1alpha1.Vizier) error {
log.Info("Updating Vizier...")

checksum, err := getSpecChecksum(vz)
if err != nil {
return err
Expand Down Expand Up @@ -503,10 +505,11 @@ func (r *VizierReconciler) upgradeNats(ctx context.Context, namespace string, vz
return r.deployNATSStatefulset(ctx, namespace, vz, yamlMap)
}

// TODO(michellenguyen): Add a goroutine
// which checks when certs are about to expire. If they are about to expire,
// we should generate new certs and bounce all pods.
func (r *VizierReconciler) deployVizierCerts(ctx context.Context, namespace string, vz *v1alpha1.Vizier) error {
	// Thin wrapper: hand the reconciler's clientset and REST config to
	// deployCerts, with the update flag turned off.
	const update = false
	return deployCerts(ctx, namespace, vz, r.Clientset, r.RestConfig, update)
}

func deployCerts(ctx context.Context, namespace string, vz *v1alpha1.Vizier, clientset kubernetes.Interface, restConfig *rest.Config, update bool) error {
log.Info("Generating certs")

// Assign JWT signing key.
Expand All @@ -515,13 +518,13 @@ func (r *VizierReconciler) deployVizierCerts(ctx context.Context, namespace stri
if err != nil {
return err
}
s := k8s.GetSecret(r.Clientset, namespace, "pl-cluster-secrets")
s := k8s.GetSecret(clientset, namespace, "pl-cluster-secrets")
if s == nil {
return errors.New("pl-cluster-secrets does not exist")
}
s.Data[clusterSecretJWTKey] = []byte(fmt.Sprintf("%x", jwtSigningKey))

_, err = r.Clientset.CoreV1().Secrets(namespace).Update(ctx, s, metav1.UpdateOptions{})
_, err = clientset.CoreV1().Secrets(namespace).Update(ctx, s, metav1.UpdateOptions{})
if err != nil {
return err
}
Expand All @@ -542,7 +545,7 @@ func (r *VizierReconciler) deployVizierCerts(ctx context.Context, namespace stri
}
}

return k8s.ApplyResources(r.Clientset, r.RestConfig, resources, namespace, nil, false)
return k8s.ApplyResources(clientset, restConfig, resources, namespace, nil, update)
}

// deployVizierConfigs deploys the secrets, configmaps, and certs that are necessary for running vizier.
Expand Down
4 changes: 4 additions & 0 deletions src/shared/status/vzstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ var reasonToMessageMap = map[VizierReason]string{
"If this problem persists, clobber and re-deploy your Pixie instance",
PEMsHighFailureRate: "PEMs are experiencing a high crash rate. Your Pixie experience will be degraded while this occurs. If PEMs are getting OOMKilled, increase your PEM memory limits using the `pemMemoryLimit` flag.",
PEMsAllFailing: "PEMs are all crashing. If PEMs are getting OOMKilled, increase your PEM memory limits using the `pemMemoryLimit` flag. Otherwise, consider filing a bug so someone can address your problem: https://github.com/pixie-io/pixie",
TLSCertsExpired: "Service TLS certs are expired. If using the operator, the certs will be auto-regenerated. Otherwise, please redeploy Vizier.",
}

// VizierReason is the reason that Vizier is in its current state.
Expand Down Expand Up @@ -131,4 +132,7 @@ const (
PEMsHighFailureRate VizierReason = "PEMsHighFailureRate"
// PEMsAllFailing occurs when all PEMs are failing.
PEMsAllFailing VizierReason = "PEMsAllFailing"

// TLSCertsExpired occurs when the service TLS certs are expired or almost expired.
TLSCertsExpired VizierReason = "TLSCertsExpired"
)

0 comments on commit 8fbd3bb

Please sign in to comment.