Skip to content

Commit

Permalink
Add timeouts to HealthChecks and retry checks
Browse files Browse the repository at this point in the history
Fixes issue kubernetes#3532. Added timeouts for HTTP and TCP checks
and enabled kubelet/probe to kubelet#maxRetries times
before declaring Failure.

Added Probe.TimeoutSecs to API

Probe variants now check container.LivenessProbe.TimeoutSeconds
Also added a test for timeouts in http_test.go.
  • Loading branch information
commonlisp committed Feb 5, 2015
1 parent e335e2d commit e8c33b7
Show file tree
Hide file tree
Showing 12 changed files with 60 additions and 21 deletions.
2 changes: 2 additions & 0 deletions pkg/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ type Probe struct {
Handler `json:",inline"`
// Length of time before health checking is activated. In seconds.
InitialDelaySeconds int64 `json:"initialDelaySeconds,omitempty"`
// Length of time before health checking times out. In seconds.
TimeoutSeconds int64 `json:"timeoutSeconds,omitempty"`
}

// PullPolicy describes a policy for if/when to pull a container image
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/v1beta1/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,7 @@ func init() {
return err
}
out.InitialDelaySeconds = in.InitialDelaySeconds
out.TimeoutSeconds = in.TimeoutSeconds
return nil
},
func(in *LivenessProbe, out *newer.Probe, s conversion.Scope) error {
Expand All @@ -924,6 +925,7 @@ func init() {
return err
}
out.InitialDelaySeconds = in.InitialDelaySeconds
out.TimeoutSeconds = in.TimeoutSeconds
return nil
},
)
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/v1beta1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ type LivenessProbe struct {
Exec *ExecAction `json:"exec,omitempty" description:"parameters for exec-based liveness probe"`
// Length of time before health checking is activated. In seconds.
InitialDelaySeconds int64 `json:"initialDelaySeconds,omitempty" description:"number of seconds after the container has started before liveness probes are initiated"`
// Length of time before health checking times out. In seconds.
TimeoutSeconds int64 `json:"timeoutSeconds,omitempty" description:"number of seconds after which liveness probes timeout; defaults to 1 second"`
}

// PullPolicy describes a policy for if/when to pull a container image
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/v1beta2/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,7 @@ func init() {
return err
}
out.InitialDelaySeconds = in.InitialDelaySeconds
out.TimeoutSeconds = in.TimeoutSeconds
return nil
},
func(in *LivenessProbe, out *newer.Probe, s conversion.Scope) error {
Expand All @@ -837,6 +838,7 @@ func init() {
return err
}
out.InitialDelaySeconds = in.InitialDelaySeconds
out.TimeoutSeconds = in.TimeoutSeconds
return nil
},
)
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/v1beta2/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ type LivenessProbe struct {
Exec *ExecAction `json:"exec,omitempty" description:"parameters for exec-based liveness probe"`
// Length of time before health checking is activated. In seconds.
InitialDelaySeconds int64 `json:"initialDelaySeconds,omitempty" description:"number of seconds after the container has started before liveness probes are initiated"`
// Length of time before health checking times out. In seconds.
TimeoutSeconds int64 `json:"timeoutSeconds,omitempty" description:"number of seconds after which liveness probes timeout; defaults to 1 second"`
}

// PullPolicy describes a policy for if/when to pull a container image
Expand Down
2 changes: 2 additions & 0 deletions pkg/api/v1beta3/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ type Probe struct {
Handler `json:",inline"`
// Length of time before health checking is activated. In seconds.
InitialDelaySeconds int64 `json:"initialDelaySeconds,omitempty"`
// Length of time before health checking times out. In seconds.
TimeoutSeconds int64 `json:"timeoutSeconds,omitempty"`
}

// PullPolicy describes a policy for if/when to pull a container image
Expand Down
11 changes: 9 additions & 2 deletions pkg/kubelet/kubelet.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const defaultChanSize = 1024
const minShares = 2
const sharesPerCPU = 1024
const milliCPUToCPU = 1000
const maxRetries int = 3

// SyncHandler is an interface implemented by Kubelet, for testability
type SyncHandler interface {
Expand Down Expand Up @@ -1417,15 +1418,21 @@ func (kl *Kubelet) GetPodStatus(podFullName string, uid types.UID) (api.PodStatu
return podStatus, err
}

func (kl *Kubelet) probeLiveness(podFullName string, podUID types.UID, status api.PodStatus, container api.Container, dockerContainer *docker.APIContainers) (probe.Status, error) {
func (kl *Kubelet) probeLiveness(podFullName string, podUID types.UID, status api.PodStatus, container api.Container, dockerContainer *docker.APIContainers) (healthStatus probe.Status, err error) {
// Give the container 60 seconds to start up.
if container.LivenessProbe == nil {
return probe.Success, nil
}
if time.Now().Unix()-dockerContainer.Created < container.LivenessProbe.InitialDelaySeconds {
return probe.Success, nil
}
return kl.probeContainer(container.LivenessProbe, podFullName, podUID, status, container)
for i := 0; i < maxRetries; i++ {
healthStatus, err = kl.probeContainer(container.LivenessProbe, podFullName, podUID, status, container)
if healthStatus == probe.Success {
return
}
}
return healthStatus, err
}

// Returns logs of current machine.
Expand Down
14 changes: 12 additions & 2 deletions pkg/kubelet/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package kubelet
import (
"fmt"
"strconv"
"time"

"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/probe"
Expand All @@ -39,6 +40,14 @@ var (
)

func (kl *Kubelet) probeContainer(p *api.Probe, podFullName string, podUID types.UID, status api.PodStatus, container api.Container) (probe.Status, error) {
var timeout time.Duration
secs := container.LivenessProbe.TimeoutSeconds
if secs > 0 {
timeout = time.Duration(secs) * time.Second
} else {
timeout = 1 * time.Second
}

if p.Exec != nil {
return execprober.Probe(kl.newExecInContainer(podFullName, podUID, container))
}
Expand All @@ -47,14 +56,15 @@ func (kl *Kubelet) probeContainer(p *api.Probe, podFullName string, podUID types
if err != nil {
return probe.Unknown, err
}
return httprober.Probe(extractGetParams(p.HTTPGet, status, port))
host, port, path := extractGetParams(p.HTTPGet, status, port)
return httprober.Probe(host, port, path, timeout)
}
if p.TCPSocket != nil {
port, err := extractPort(p.TCPSocket.Port, container)
if err != nil {
return probe.Unknown, err
}
return tcprober.Probe(status.PodIP, port)
return tcprober.Probe(status.PodIP, port, timeout)
}
glog.Warningf("Failed to find probe builder for %s %+v", container.Name, container.LivenessProbe)
return probe.Unknown, nil
Expand Down
10 changes: 6 additions & 4 deletions pkg/probe/http/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,25 @@ import (
"net/http"
"net/url"
"strconv"
"time"

"github.com/GoogleCloudPlatform/kubernetes/pkg/probe"

"github.com/golang/glog"
)

func New() HTTPProber {
return HTTPProber{&http.Client{}}
transport := &http.Transport{}
return HTTPProber{transport}
}

type HTTPProber struct {
client HTTPGetInterface
transport *http.Transport
}

// Probe returns a ProbeRunner capable of running an http check.
func (pr *HTTPProber) Probe(host string, port int, path string) (probe.Status, error) {
return DoHTTPProbe(formatURL(host, port, path), pr.client)
func (pr *HTTPProber) Probe(host string, port int, path string, timeout time.Duration) (probe.Status, error) {
return DoHTTPProbe(formatURL(host, port, path), &http.Client{Timeout: timeout, Transport: pr.transport})
}

type HTTPGetInterface interface {
Expand Down
22 changes: 14 additions & 8 deletions pkg/probe/http/http_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/url"
"strconv"
"testing"
"time"

"github.com/GoogleCloudPlatform/kubernetes/pkg/probe"
)
Expand All @@ -46,20 +47,25 @@ func TestFormatURL(t *testing.T) {
}

func TestHTTPProbeChecker(t *testing.T) {
handleReq := func(s int) func(w http.ResponseWriter) {
return func(w http.ResponseWriter) { w.WriteHeader(s) }
}

prober := New()
testCases := []struct {
status int
health probe.Status
handler func(w http.ResponseWriter)
health probe.Status
}{
// The probe will be filled in below. This is primarily testing that an HTTP GET happens.
{http.StatusOK, probe.Success},
{-1, probe.Failure},
{handleReq(http.StatusOK), probe.Success},
{handleReq(-1), probe.Failure},
{func(w http.ResponseWriter) { time.Sleep(3 * time.Second) }, probe.Failure},
}
for _, test := range testCases {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(test.status)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
test.handler(w)
}))
u, err := url.Parse(ts.URL)
u, err := url.Parse(server.URL)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand All @@ -71,7 +77,7 @@ func TestHTTPProbeChecker(t *testing.T) {
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
health, err := prober.Probe(host, p, "")
health, err := prober.Probe(host, p, "", 1*time.Second)
if test.health == probe.Unknown && err == nil {
t.Errorf("Expected error")
}
Expand Down
9 changes: 5 additions & 4 deletions pkg/probe/tcp/tcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package tcp
import (
"net"
"strconv"
"time"

"github.com/GoogleCloudPlatform/kubernetes/pkg/probe"

Expand All @@ -31,16 +32,16 @@ func New() TCPProber {

type TCPProber struct{}

func (pr TCPProber) Probe(host string, port int) (probe.Status, error) {
return DoTCPProbe(net.JoinHostPort(host, strconv.Itoa(port)))
func (pr TCPProber) Probe(host string, port int, timeout time.Duration) (probe.Status, error) {
return DoTCPProbe(net.JoinHostPort(host, strconv.Itoa(port)), timeout)
}

// DoTCPProbe checks that a TCP socket to the address can be opened.
// If the socket can be opened, it returns Success
// If the socket fails to open, it returns Failure.
// This is exported because some other packages may want to do direct TCP probes.
func DoTCPProbe(addr string) (probe.Status, error) {
conn, err := net.Dial("tcp", addr)
func DoTCPProbe(addr string, timeout time.Duration) (probe.Status, error) {
conn, err := net.DialTimeout("tcp", addr, timeout)
if err != nil {
return probe.Failure, nil
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/probe/tcp/tcp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/url"
"strconv"
"testing"
"time"

"github.com/GoogleCloudPlatform/kubernetes/pkg/probe"
)
Expand Down Expand Up @@ -59,7 +60,7 @@ func TestTcpHealthChecker(t *testing.T) {
if !test.usePort {
p = -1
}
status, err := prober.Probe(host, p)
status, err := prober.Probe(host, p, 1*time.Second)
if status != test.expectedStatus {
t.Errorf("expected: %v, got: %v", test.expectedStatus, status)
}
Expand Down

0 comments on commit e8c33b7

Please sign in to comment.