Skip to content

Commit bdc1139

Browse files
authored
feat: add support for health-check flag (#1271)
1 parent d5858bd commit bdc1139

File tree

11 files changed

+673
-60
lines changed

11 files changed

+673
-60
lines changed

cmd/root.go

Lines changed: 62 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"contrib.go.opencensus.io/exporter/prometheus"
3333
"contrib.go.opencensus.io/exporter/stackdriver"
3434
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/cloudsql"
35+
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/internal/healthcheck"
3536
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/internal/log"
3637
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/internal/proxy"
3738
"github.com/spf13/cobra"
@@ -76,6 +77,7 @@ type Command struct {
7677
telemetryProject string
7778
telemetryPrefix string
7879
prometheusNamespace string
80+
healthCheck bool
7981
httpPort string
8082
}
8183

@@ -157,7 +159,6 @@ When this flag is not set, there is no limit.`)
157159
to close after receiving a TERM signal. The proxy will shut
158160
down when the number of open connections reaches 0 or when
159161
the maximum time has passed. Defaults to 0s.`)
160-
161162
cmd.PersistentFlags().StringVar(&c.telemetryProject, "telemetry-project", "",
162163
"Enable Cloud Monitoring and Cloud Trace integration with the provided project ID.")
163164
cmd.PersistentFlags().BoolVar(&c.disableTraces, "disable-traces", false,
@@ -172,12 +173,16 @@ the maximum time has passed. Defaults to 0s.`)
172173
"Enable Prometheus for metric collection using the provided namespace")
173174
cmd.PersistentFlags().StringVar(&c.httpPort, "http-port", "9090",
174175
"Port for the Prometheus server to use")
176+
cmd.PersistentFlags().BoolVar(&c.healthCheck, "health-check", false,
177+
`Enables HTTP endpoints /startup, /liveness, and /readiness
178+
that report on the proxy's health. Endpoints are available on localhost
179+
only. Uses the port specified by the http-port flag.`)
175180
cmd.PersistentFlags().StringVar(&c.conf.APIEndpointURL, "sqladmin-api-endpoint", "",
176181
"When set, the proxy uses this url as the API endpoint for all Cloud SQL Admin API requests.\nExample: https://sqladmin.googleapis.com")
177182
cmd.PersistentFlags().StringVar(&c.conf.QuotaProject, "quota-project", "",
178183
`Specifies the project to use for Cloud SQL Admin API quota tracking.
179184
The IAM principal must have the "serviceusage.services.use" permission
180-
for the given project. See https://cloud.google.com/service-usage/docs/overview and
185+
for the given project. See https://cloud.google.com/service-usage/docs/overview and
181186
https://cloud.google.com/storage/docs/requester-pays`)
182187

183188
// Global and per instance flags
@@ -225,18 +230,18 @@ func parseConfig(cmd *Command, conf *proxy.Config, args []string) error {
225230
return newBadCommandError("cannot specify --credentials-file and --gcloud-auth flags at the same time")
226231
}
227232

228-
if userHasSet("http-port") && !userHasSet("prometheus-namespace") {
229-
return newBadCommandError("cannot specify --http-port without --prometheus-namespace")
233+
if userHasSet("http-port") && !userHasSet("prometheus-namespace") && !userHasSet("health-check") {
234+
cmd.logger.Infof("Ignoring --http-port because --prometheus-namespace or --health-check was not set")
230235
}
231236

232237
if !userHasSet("telemetry-project") && userHasSet("telemetry-prefix") {
233-
cmd.logger.Infof("Ignoring telementry-prefix as telemetry-project was not set")
238+
cmd.logger.Infof("Ignoring --telementry-prefix because --telemetry-project was not set")
234239
}
235240
if !userHasSet("telemetry-project") && userHasSet("disable-metrics") {
236-
cmd.logger.Infof("Ignoring disable-metrics as telemetry-project was not set")
241+
cmd.logger.Infof("Ignoring --disable-metrics because --telemetry-project was not set")
237242
}
238243
if !userHasSet("telemetry-project") && userHasSet("disable-traces") {
239-
cmd.logger.Infof("Ignoring disable-traces as telemetry-project was not set")
244+
cmd.logger.Infof("Ignoring --disable-traces because --telemetry-project was not set")
240245
}
241246

242247
if userHasSet("sqladmin-api-endpoint") && conf.APIEndpointURL != "" {
@@ -364,9 +369,8 @@ func runSignalWrapper(cmd *Command) error {
364369
ctx, cancel := context.WithCancel(cmd.Context())
365370
defer cancel()
366371

367-
// Configure Cloud Trace and/or Cloud Monitoring based on command
368-
// invocation. If a project has not been enabled, no traces or metrics are
369-
// enabled.
372+
// Configure collectors before the proxy has started to ensure we are
373+
// collecting metrics before *ANY* Cloud SQL Admin API calls are made.
370374
enableMetrics := !cmd.disableMetrics
371375
enableTraces := !cmd.disableTraces
372376
if cmd.telemetryProject != "" && (enableMetrics || enableTraces) {
@@ -394,40 +398,22 @@ func runSignalWrapper(cmd *Command) error {
394398
}()
395399
}
396400

397-
shutdownCh := make(chan error)
398-
401+
var (
402+
needsHTTPServer bool
403+
mux = http.NewServeMux()
404+
)
399405
if cmd.prometheusNamespace != "" {
406+
needsHTTPServer = true
400407
e, err := prometheus.NewExporter(prometheus.Options{
401408
Namespace: cmd.prometheusNamespace,
402409
})
403410
if err != nil {
404411
return err
405412
}
406-
mux := http.NewServeMux()
407413
mux.Handle("/metrics", e)
408-
addr := fmt.Sprintf("localhost:%s", cmd.httpPort)
409-
server := &http.Server{Addr: addr, Handler: mux}
410-
go func() {
411-
select {
412-
case <-ctx.Done():
413-
// Give the HTTP server a second to shutdown cleanly.
414-
ctx2, _ := context.WithTimeout(context.Background(), time.Second)
415-
if err := server.Shutdown(ctx2); err != nil {
416-
cmd.logger.Errorf("failed to shutdown Prometheus HTTP server: %v\n", err)
417-
}
418-
}
419-
}()
420-
go func() {
421-
err := server.ListenAndServe()
422-
if err == http.ErrServerClosed {
423-
return
424-
}
425-
if err != nil {
426-
shutdownCh <- fmt.Errorf("failed to start prometheus HTTP server: %v", err)
427-
}
428-
}()
429414
}
430415

416+
shutdownCh := make(chan error)
431417
// watch for sigterm / sigint signals
432418
signals := make(chan os.Signal, 1)
433419
signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT)
@@ -465,18 +451,55 @@ func runSignalWrapper(cmd *Command) error {
465451
cmd.logger.Errorf("The proxy has encountered a terminal error: %v", err)
466452
return err
467453
case p = <-startCh:
454+
cmd.logger.Infof("The proxy has started successfully and is ready for new connections!")
468455
}
469-
cmd.logger.Infof("The proxy has started successfully and is ready for new connections!")
470-
defer p.Close()
471456
defer func() {
472457
if cErr := p.Close(); cErr != nil {
473458
cmd.logger.Errorf("error during shutdown: %v", cErr)
474459
}
475460
}()
476461

477-
go func() {
478-
shutdownCh <- p.Serve(ctx)
479-
}()
462+
notify := func() {}
463+
if cmd.healthCheck {
464+
needsHTTPServer = true
465+
hc := healthcheck.NewCheck(p, cmd.logger)
466+
mux.HandleFunc("/startup", hc.HandleStartup)
467+
mux.HandleFunc("/readiness", hc.HandleReadiness)
468+
mux.HandleFunc("/liveness", hc.HandleLiveness)
469+
notify = hc.NotifyStarted
470+
}
471+
472+
// Start the HTTP server if anything requiring HTTP is specified.
473+
if needsHTTPServer {
474+
server := &http.Server{
475+
Addr: fmt.Sprintf("localhost:%s", cmd.httpPort),
476+
Handler: mux,
477+
}
478+
// Start the HTTP server.
479+
go func() {
480+
err := server.ListenAndServe()
481+
if err == http.ErrServerClosed {
482+
return
483+
}
484+
if err != nil {
485+
shutdownCh <- fmt.Errorf("failed to start HTTP server: %v", err)
486+
}
487+
}()
488+
// Handle shutdown of the HTTP server gracefully.
489+
go func() {
490+
select {
491+
case <-ctx.Done():
492+
// Give the HTTP server a second to shutdown cleanly.
493+
ctx2, cancel := context.WithTimeout(context.Background(), time.Second)
494+
defer cancel()
495+
if err := server.Shutdown(ctx2); err != nil {
496+
cmd.logger.Errorf("failed to shutdown Prometheus HTTP server: %v\n", err)
497+
}
498+
}
499+
}()
500+
}
501+
502+
go func() { shutdownCh <- p.Serve(ctx, notify) }()
480503

481504
err := <-shutdownCh
482505
switch {

cmd/root_test.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,6 @@ func TestNewCommandWithErrors(t *testing.T) {
516516
desc: "when the iam authn login query param is bogus",
517517
args: []string{"proj:region:inst?auto-iam-authn=nope"},
518518
},
519-
{
520-
desc: "enabling a Prometheus port without a namespace",
521-
args: []string{"--http-port", "1111", "proj:region:inst"},
522-
},
523519
{
524520
desc: "using an invalid url for sqladmin-api-endpoint",
525521
args: []string{"--sqladmin-api-endpoint", "https://user:abc{DEf1=ghi@example.com:5432/db?sslmode=require", "proj:region:inst"},
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Copyright 2022 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package healthcheck tests and communicates the health of the Cloud SQL Auth proxy.
16+
package healthcheck
17+
18+
import (
19+
"context"
20+
"errors"
21+
"fmt"
22+
"net/http"
23+
"sync"
24+
25+
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/cloudsql"
26+
"github.com/GoogleCloudPlatform/cloudsql-proxy/v2/internal/proxy"
27+
)
28+
29+
// Check provides HTTP handlers for use as healthchecks typically in a
30+
// Kubernetes context.
31+
type Check struct {
32+
once *sync.Once
33+
started chan struct{}
34+
proxy *proxy.Client
35+
logger cloudsql.Logger
36+
}
37+
38+
// NewCheck is the initializer for Check.
39+
func NewCheck(p *proxy.Client, l cloudsql.Logger) *Check {
40+
return &Check{
41+
once: &sync.Once{},
42+
started: make(chan struct{}),
43+
proxy: p,
44+
logger: l,
45+
}
46+
}
47+
48+
// NotifyStarted notifies the check that the proxy has started up successfully.
49+
func (c *Check) NotifyStarted() {
50+
c.once.Do(func() { close(c.started) })
51+
}
52+
53+
// HandleStartup reports whether the Check has been notified of startup.
54+
func (c *Check) HandleStartup(w http.ResponseWriter, _ *http.Request) {
55+
select {
56+
case <-c.started:
57+
w.WriteHeader(http.StatusOK)
58+
w.Write([]byte("ok"))
59+
default:
60+
w.WriteHeader(http.StatusServiceUnavailable)
61+
w.Write([]byte("error"))
62+
}
63+
}
64+
65+
// errNotStarted is the readiness failure reported while the Check has not
// yet been notified of startup.
var errNotStarted = errors.New("proxy is not started")
66+
67+
// HandleReadiness ensures the Check has been notified of successful startup,
68+
// that the proxy has not reached maximum connections, and that all connections
69+
// are healthy.
70+
func (c *Check) HandleReadiness(w http.ResponseWriter, _ *http.Request) {
71+
ctx, cancel := context.WithCancel(context.Background())
72+
defer cancel()
73+
74+
select {
75+
case <-c.started:
76+
default:
77+
c.logger.Errorf("[Health Check] Readiness failed: %v", errNotStarted)
78+
w.WriteHeader(http.StatusServiceUnavailable)
79+
w.Write([]byte(errNotStarted.Error()))
80+
return
81+
}
82+
83+
if open, max := c.proxy.ConnCount(); max > 0 && open == max {
84+
err := fmt.Errorf("max connections reached (open = %v, max = %v)", open, max)
85+
c.logger.Errorf("[Health Check] Readiness failed: %v", err)
86+
w.WriteHeader(http.StatusServiceUnavailable)
87+
w.Write([]byte(err.Error()))
88+
return
89+
}
90+
91+
err := c.proxy.CheckConnections(ctx)
92+
if err != nil {
93+
c.logger.Errorf("[Health Check] Readiness failed: %v", err)
94+
w.WriteHeader(http.StatusServiceUnavailable)
95+
w.Write([]byte(err.Error()))
96+
return
97+
}
98+
99+
w.WriteHeader(http.StatusOK)
100+
w.Write([]byte("ok"))
101+
}
102+
103+
// HandleLiveness indicates the process is up and responding to HTTP requests.
104+
// If this check fails (because it's not reachable), the process is in a bad
105+
// state and should be restarted.
106+
func (c *Check) HandleLiveness(w http.ResponseWriter, _ *http.Request) {
107+
w.WriteHeader(http.StatusOK)
108+
w.Write([]byte("ok"))
109+
}

0 commit comments

Comments
 (0)