diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63611e3e4238f..bc0f3fdd431f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@ specifying a docker endpoint to get metrics from.
 - [#440](https://github.com/influxdata/telegraf/issues/440): Don't query filtered devices for disk stats.
 - [#463](https://github.com/influxdata/telegraf/issues/463): Docker plugin not working on AWS Linux
 - [#568](https://github.com/influxdata/telegraf/issues/568): Multiple output race condition.
+- [#585](https://github.com/influxdata/telegraf/pull/585): Log stack trace and continue on Telegraf panic. Thanks @wutaizeng!
 
 ## v0.10.0 [2016-01-12]
 
diff --git a/agent.go b/agent.go
index d0f82145e116f..ee5f45029de2d 100644
--- a/agent.go
+++ b/agent.go
@@ -7,6 +7,7 @@ import (
 	"math/big"
 	"math/rand"
 	"os"
+	"runtime"
 	"sync"
 	"time"
 
@@ -87,6 +88,24 @@ func (a *Agent) Close() error {
 	return err
 }
 
+// panicRecover recovers from a panic raised while running the given input,
+// logs the panic value together with a stack trace, and asks the user to
+// report it. It must be invoked directly via defer, since recover only has
+// an effect when called by the deferred function itself.
+func panicRecover(input *models.RunningInput) {
+	if err := recover(); err != nil {
+		trace := make([]byte, 2048)
+		// runtime.Stack reports how many bytes it wrote; keep only those so
+		// the log does not end in a run of unused NUL bytes.
+		trace = trace[:runtime.Stack(trace, true)]
+		log.Printf("FATAL: Input [%s] panicked: %v, Stack:\n%s\n",
+			input.Name, err, trace)
+		log.Println("PLEASE REPORT THIS PANIC ON GITHUB with " +
+			"stack trace, configuration, and OS information: " +
+			"https://github.com/influxdata/telegraf/issues/new")
+	}
+}
+
 // gatherParallel runs the inputs that are using the same reporting interval
 // as the telegraf agent.
 func (a *Agent) gatherParallel(pointChan chan *client.Point) error {
@@ -103,6 +122,7 @@ func (a *Agent) gatherParallel(pointChan chan *client.Point) error {
 		wg.Add(1)
 		counter++
 		go func(input *models.RunningInput) {
+			defer panicRecover(input)
 			defer wg.Done()
 
 			acc := NewAccumulator(input.Config, pointChan)
@@ -148,6 +168,8 @@ func (a *Agent) gatherSeparate(
 	input *models.RunningInput,
 	pointChan chan *client.Point,
 ) error {
+	defer panicRecover(input)
+
 	ticker := time.NewTicker(input.Config.Interval)
 
 	for {