Skip to content

Commit

Permalink
JMXFetch cycle management on Windows (#5309)
Browse files Browse the repository at this point in the history
Ports the lifecycle management code to Windows. No code changes were needed, since the Linux/Mac code seems to work fine on Windows too.
  • Loading branch information
albertvaka authored Apr 17, 2020
1 parent 1987346 commit 61bdb6b
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 65 deletions.
5 changes: 0 additions & 5 deletions pkg/collector/corechecks/embed/jmx/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
package jmx

import (
"runtime"
"time"

"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
Expand All @@ -30,10 +29,6 @@ func (r *runner) initRunner() {
func (r *runner) startRunner() error {

lifecycleMgmt := true
if runtime.GOOS == "windows" {
lifecycleMgmt = false
}

err := r.jmxfetch.Start(lifecycleMgmt)
if err != nil {
s := status.JMXStartupError{LastError: err.Error(), Timestamp: time.Now().Unix()}
Expand Down
51 changes: 51 additions & 0 deletions pkg/jmxfetch/jmxfetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
api "github.com/DataDog/datadog-agent/pkg/api/util"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/status"
"github.com/DataDog/datadog-agent/pkg/status/health"
"github.com/DataDog/datadog-agent/pkg/util/log"
"gopkg.in/yaml.v2"
Expand Down Expand Up @@ -105,6 +106,56 @@ type checkInitCfg struct {
JavaOptions string `yaml:"java_options,omitempty"`
}

// Monitor supervises the JMXFetch process. It blocks on j.Wait() and, each
// time the process exits with an error, restarts it through j.Start(false)
// until j.shutdown is signalled or too many abnormal exits happen in a short
// time span.
//
// The restart budget is read from the `jmx_max_restarts` and
// `jmx_restart_interval` (seconds) settings. When the budget is exhausted the
// failure is logged and published with status.SetJMXStartupError, and Monitor
// returns. When the process exits without error, Monitor stops restarting and
// simply waits for j.shutdown.
//
// j.stopped is closed when Monitor returns, whatever the reason.
func (j *JMXFetch) Monitor() {
	idx := 0
	maxRestarts := config.Datadog.GetInt("jmx_max_restarts")
	ival := float64(config.Datadog.GetInt("jmx_restart_interval"))

	// stopTimes is a ring buffer of the most recent abnormal exit times. A
	// non-positive jmx_max_restarts would make the buffer empty (or make panic
	// on a negative length) and turn the index arithmetic below into a modulo
	// by zero, so fall back to a single slot: with a non-negative interval
	// the first abnormal exit is then fatal, i.e. no restart is attempted.
	ringSize := maxRestarts
	if ringSize < 1 {
		ringSize = 1
	}
	stopTimes := make([]time.Time, ringSize)
	ticker := time.NewTicker(500 * time.Millisecond)

	defer ticker.Stop()
	defer close(j.stopped)

	go j.heartbeat(ticker)

	for {
		err := j.Wait()
		if err == nil {
			log.Infof("JMXFetch stopped and exited sanely.")
			break
		}

		stopTimes[idx] = time.Now()
		// The slot right after the current one holds the oldest recorded exit.
		oldestIdx := (idx + 1) % ringSize

		// Please note that the zero value for `time.Time` is `0001-01-01 00:00:00 +0000 UTC`,
		// therefore while the ring still contains unused slots the computed interval is
		// always bigger than ival (jmx_restart_interval). This subtraction only starts
		// yielding values potentially <= ival once every slot has been filled, i.e. after
		// ringSize abnormal exits (with a single slot: on the very first one).
		if stopTimes[idx].Sub(stopTimes[oldestIdx]).Seconds() <= ival {
			msg := fmt.Sprintf("Too many JMXFetch restarts (%v) in time interval (%vs) - giving up", maxRestarts, ival)
			// Pass msg as an argument, never as the format string.
			log.Errorf("%s", msg)
			s := status.JMXStartupError{LastError: msg, Timestamp: time.Now().Unix()}
			status.SetJMXStartupError(s)
			return
		}

		idx = oldestIdx

		select {
		case <-j.shutdown:
			return
		default:
			// restart
			log.Warnf("JMXFetch process had to be restarted.")
			if err := j.Start(false); err != nil {
				// Do not bail out here: the loop goes back to j.Wait(), and the
				// restart budget above still bounds the number of attempts.
				log.Errorf("JMXFetch restart failed: %s", err)
			}
		}
	}

	<-j.shutdown
}

func (j *JMXFetch) setDefaults() {
if j.JavaBinPath == "" {
j.JavaBinPath = defaultJavaBinPath
Expand Down
53 changes: 0 additions & 53 deletions pkg/jmxfetch/jmxfetch_nix.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,66 +9,13 @@
package jmxfetch

import (
"fmt"
"os"
"syscall"
"time"

"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/status"
"github.com/DataDog/datadog-agent/pkg/util/log"
)

// Monitor supervises the JMXFetch process: it blocks on j.Wait() and, each time
// the process exits with an error, restarts it through j.Start(false) until
// j.shutdown is signalled or too many abnormal exits are recorded within
// `jmx_restart_interval` seconds (budget sized by `jmx_max_restarts`).
// j.stopped is closed when Monitor returns.
func (j *JMXFetch) Monitor() {
idx := 0
maxRestarts := config.Datadog.GetInt("jmx_max_restarts")
ival := float64(config.Datadog.GetInt("jmx_restart_interval"))
// Ring buffer holding the times of the most recent abnormal exits.
// NOTE(review): a jmx_max_restarts of 0 looks like it would panic below
// (index into an empty slice / modulo by zero) - confirm the config is validated.
stopTimes := make([]time.Time, maxRestarts)
ticker := time.NewTicker(500 * time.Millisecond)

defer ticker.Stop()
defer close(j.stopped)

go j.heartbeat(ticker)

for {
err := j.Wait()
if err == nil {
// Clean exit: stop restarting and fall through to wait for shutdown.
log.Infof("JMXFetch stopped and exited sanely.")
break
}

stopTimes[idx] = time.Now()
// Equivalent to (idx + 1) % maxRestarts: the slot after the current one is the oldest.
oldestIdx := (idx + maxRestarts + 1) % maxRestarts

// Please note that the zero value for `time.Time` is `0001-01-01 00:00:00 +0000 UTC`
// therefore for the first iteration (the initial launch attempt), the interval will
// always be bigger than ival (jmx_restart_interval). In fact, this sub operation with
// stopTimes here will only start yielding values potentially <= ival _after_ the first
// maxRestarts attempts, which is fine and consistent.
if stopTimes[idx].Sub(stopTimes[oldestIdx]).Seconds() <= ival {
msg := fmt.Sprintf("Too many JMXFetch restarts (%v) in time interval (%vs) - giving up", maxRestarts, ival)
log.Errorf(msg)
s := status.JMXStartupError{LastError: msg, Timestamp: time.Now().Unix()}
status.SetJMXStartupError(s)
return
}

idx = (idx + 1) % maxRestarts

select {
case <-j.shutdown:
// Shutdown was requested while the process was down: do not restart.
return
default:
// restart
log.Warnf("JMXFetch process had to be restarted.")
// The value returned by Start is not checked here.
j.Start(false)
}
}

// Reached only after a clean exit; block until shutdown is signalled.
<-j.shutdown
}

// Stop stops the JMXFetch process
func (j *JMXFetch) Stop() error {
var stopChan chan struct{}
Expand Down
20 changes: 13 additions & 7 deletions pkg/jmxfetch/jmxfetch_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,26 @@ import (
"github.com/DataDog/datadog-agent/pkg/util/log"
)

// Monitor is a no-op in this Windows-specific file: the body is empty, so it
// returns immediately without supervising or restarting anything.
func (j *JMXFetch) Monitor() {}

// Stop stops the JMXFetch process
func (j *JMXFetch) Stop() error {
var stopChan chan struct{}

err := j.cmd.Process.Kill()
if err != nil {
return err
}

stopChan := make(chan struct{})
go func() {
j.Wait()
close(stopChan)
}()
if j.managed {
stopChan = j.stopped
close(j.shutdown)
} else {
stopChan = make(chan struct{})

go func() {
j.Wait()
close(stopChan)
}()
}

select {
case <-time.After(time.Millisecond * 1000):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
JMXFetch (helper for JMX checks) is now restarted if it crashes on Windows.

0 comments on commit 61bdb6b

Please sign in to comment.