diff --git a/cmd/livenessprobe/main.go b/cmd/livenessprobe/main.go index 906bd90c..c62beb38 100644 --- a/cmd/livenessprobe/main.go +++ b/cmd/livenessprobe/main.go @@ -23,10 +23,8 @@ import ( "net" "net/http" "os" - "sync" "time" - "google.golang.org/grpc" "k8s.io/klog/v2" "k8s.io/component-base/featuregate" @@ -62,7 +60,7 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) { ctx, cancel := context.WithTimeout(req.Context(), *probeTimeout) defer cancel() - conn, err := acquireConnection(ctx, h.metricsManager) + conn, err := connlib.Connect(*csiAddress, h.metricsManager, connlib.WithTimeout(*probeTimeout)) if err != nil { w.WriteHeader(http.StatusInternalServerError) w.Write([]byte(err.Error())) @@ -92,37 +90,6 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) { klog.V(5).InfoS("Health check succeeded") } -// acquireConnection wraps the connlib.Connect but adding support to context -// cancelation. -func acquireConnection(ctx context.Context, metricsManager metrics.CSIMetricsManager) (conn *grpc.ClientConn, err error) { - - var m sync.Mutex - var canceled bool - ready := make(chan bool) - go func() { - conn, err = connlib.Connect(*csiAddress, metricsManager) - - m.Lock() - defer m.Unlock() - if err != nil && canceled && conn != nil { - conn.Close() - } - - close(ready) - }() - - select { - case <-ctx.Done(): - m.Lock() - defer m.Unlock() - canceled = true - return nil, ctx.Err() - - case <-ready: - return conn, err - } -} - func main() { fg := featuregate.NewFeatureGate() logsapi.AddFeatureGates(fg) @@ -151,10 +118,14 @@ func main() { } metricsManager := metrics.NewCSIMetricsManager("" /* driverName */) - csiConn, err := acquireConnection(context.Background(), metricsManager) + // Connect to the CSI driver without any timeout to avoid crashing the probe when the driver is not ready yet. + // Goal: liveness probe never crashes, it only fails the probe when the driver is not available (yet). 
+ // Since an HTTP server for the probe is not running at this point, the Kubernetes liveness probe will fail immediately + // with "connection refused", which is good enough to fail the probe. + csiConn, err := connlib.Connect(*csiAddress, metricsManager, connlib.WithTimeout(0)) if err != nil { // connlib should retry forever so a returned error should mean - // the grpc client is misconfigured rather than an error on the network + // the grpc client is misconfigured rather than an error on the network or CSI driver. klog.ErrorS(err, "Failed to establish connection to CSI driver") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } @@ -163,6 +134,7 @@ func main() { csiDriverName, err := rpc.GetDriverName(context.Background(), csiConn) csiConn.Close() if err != nil { + // The CSI driver does not support GetDriverName, which is serious enough to crash the probe. klog.ErrorS(err, "Failed to get CSI driver name") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } diff --git a/go.mod b/go.mod index 787cc161..66f1999c 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ require ( github.com/golang/mock v1.6.0 github.com/kubernetes-csi/csi-lib-utils v0.17.0 github.com/kubernetes-csi/csi-test/v5 v5.2.0 - google.golang.org/grpc v1.60.1 k8s.io/component-base v0.29.0 k8s.io/klog/v2 v2.110.1 ) @@ -46,6 +45,7 @@ require ( golang.org/x/sys v0.14.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20231106174013-bbf56f31fb17 // indirect + google.golang.org/grpc v1.60.1 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect