Skip to content

Commit

Permalink
metrics: improve reading Go runtime metrics (ethereum#25886)
Browse files Browse the repository at this point in the history
This changes how we read performance metrics from the Go runtime. Instead
of using runtime.ReadMemStats, we now rely on the API provided by package
runtime/metrics.

runtime/metrics provides more accurate information. For example, the new
interface has better reporting of memory use. In my testing, the reported
value of held memory more accurately reflects the usage reported by the OS.

The semantics of metrics system/memory/allocs and system/memory/frees have
changed to report amounts in bytes. ReadMemStats only reported the count of
allocations in number-of-objects. This is imprecise: 'tiny objects' are not
counted because the runtime allocates them in batches; and certain
improvements in allocation behavior, such as struct size optimizations,
will be less visible when the number of allocs doesn't change.

Changing allocation reports to be in bytes makes it appear in graphs that
lots more is being allocated. I don't think that's a problem because this
metric is primarily interesting for geth developers.

The metric system/memory/pauses has been changed to report statistical
values from the histogram provided by the runtime. Its name in influxdb has
changed from geth.system/memory/pauses.meter to
geth.system/memory/pauses.histogram.

We also have a new histogram metric, system/cpu/schedlatency, reporting the
Go scheduler latency.
  • Loading branch information
gzliudan authored and JukLee0ira committed Dec 16, 2024
1 parent a201d95 commit 4e27af2
Show file tree
Hide file tree
Showing 11 changed files with 602 additions and 431 deletions.
35 changes: 18 additions & 17 deletions metrics/influxdb/influxdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,27 +135,28 @@ func (r *reporter) send() error {
})
case metrics.Histogram:
ms := metric.Snapshot()

if ms.Count() > 0 {
ps := ms.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999})
ps := ms.Percentiles([]float64{0.25, 0.5, 0.75, 0.95, 0.99, 0.999, 0.9999})
fields := map[string]interface{}{
"count": ms.Count(),
"max": ms.Max(),
"mean": ms.Mean(),
"min": ms.Min(),
"stddev": ms.StdDev(),
"variance": ms.Variance(),
"p25": ps[0],
"p50": ps[1],
"p75": ps[2],
"p95": ps[3],
"p99": ps[4],
"p999": ps[5],
"p9999": ps[6],
}
pts = append(pts, client.Point{
Measurement: fmt.Sprintf("%s%s.histogram", namespace, name),
Tags: r.tags,
Fields: map[string]interface{}{
"count": ms.Count(),
"max": ms.Max(),
"mean": ms.Mean(),
"min": ms.Min(),
"stddev": ms.StdDev(),
"variance": ms.Variance(),
"p50": ps[0],
"p75": ps[1],
"p95": ps[2],
"p99": ps[3],
"p999": ps[4],
"p9999": ps[5],
},
Time: now,
Fields: fields,
Time: now,
})
}
case metrics.Meter:
Expand Down
176 changes: 127 additions & 49 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ package metrics

import (
"os"
"runtime"
"runtime/metrics"
"runtime/pprof"
"strings"
"time"

Expand Down Expand Up @@ -54,73 +55,150 @@ func init() {
}
}

// CollectProcessMetrics periodically collects various metrics about the running
// process.
var threadCreateProfile = pprof.Lookup("threadcreate")

type runtimeStats struct {
GCPauses *metrics.Float64Histogram
GCAllocBytes uint64
GCFreedBytes uint64

MemTotal uint64
HeapObjects uint64
HeapFree uint64
HeapReleased uint64
HeapUnused uint64

Goroutines uint64
SchedLatency *metrics.Float64Histogram
}

var runtimeSamples = []metrics.Sample{
{Name: "/gc/pauses:seconds"}, // histogram
{Name: "/gc/heap/allocs:bytes"},
{Name: "/gc/heap/frees:bytes"},
{Name: "/memory/classes/total:bytes"},
{Name: "/memory/classes/heap/objects:bytes"},
{Name: "/memory/classes/heap/free:bytes"},
{Name: "/memory/classes/heap/released:bytes"},
{Name: "/memory/classes/heap/unused:bytes"},
{Name: "/sched/goroutines:goroutines"},
{Name: "/sched/latencies:seconds"}, // histogram
}

func readRuntimeStats(v *runtimeStats) {
metrics.Read(runtimeSamples)
for _, s := range runtimeSamples {
// Skip invalid/unknown metrics. This is needed because some metrics
// are unavailable in older Go versions, and attempting to read a 'bad'
// metric panics.
if s.Value.Kind() == metrics.KindBad {
continue
}

switch s.Name {
case "/gc/pauses:seconds":
v.GCPauses = s.Value.Float64Histogram()
case "/gc/heap/allocs:bytes":
v.GCAllocBytes = s.Value.Uint64()
case "/gc/heap/frees:bytes":
v.GCFreedBytes = s.Value.Uint64()
case "/memory/classes/total:bytes":
v.MemTotal = s.Value.Uint64()
case "/memory/classes/heap/objects:bytes":
v.HeapObjects = s.Value.Uint64()
case "/memory/classes/heap/free:bytes":
v.HeapFree = s.Value.Uint64()
case "/memory/classes/heap/released:bytes":
v.HeapReleased = s.Value.Uint64()
case "/memory/classes/heap/unused:bytes":
v.HeapUnused = s.Value.Uint64()
case "/sched/goroutines:goroutines":
v.Goroutines = s.Value.Uint64()
case "/sched/latencies:seconds":
v.SchedLatency = s.Value.Float64Histogram()
}
}
}

// CollectProcessMetrics periodically collects various metrics about the running process.
func CollectProcessMetrics(refresh time.Duration) {
// Short circuit if the metrics system is disabled
if !Enabled {
return
}

refreshFreq := int64(refresh / time.Second)

// Create the various data collectors
cpuStats := make([]*CPUStats, 2)
memstats := make([]*runtime.MemStats, 2)
diskstats := make([]*DiskStats, 2)
for i := 0; i < len(memstats); i++ {
cpuStats[i] = new(CPUStats)
memstats[i] = new(runtime.MemStats)
diskstats[i] = new(DiskStats)
}
// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)

memPauses = GetOrRegisterMeter("system/memory/pauses", DefaultRegistry)
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
memHeld = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
memUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
cpustats = make([]CPUStats, 2)
diskstats = make([]DiskStats, 2)
rstats = make([]runtimeStats, 2)
)

// This scale factor is used for the runtime's time metrics. It's useful to convert to
// ns here because the runtime gives times in float seconds, but runtimeHistogram can
// only provide integers for the minimum and maximum values.
const secondsToNs = float64(time.Second)

// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
memPauses = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil)
memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry)
memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry)
memTotal = GetOrRegisterGauge("system/memory/held", DefaultRegistry)
heapUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry)
heapObjects = GetOrRegisterGauge("system/memory/objects", DefaultRegistry)
diskReads = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry)
diskReadBytes = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry)
diskReadBytesCounter = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry)
diskWrites = GetOrRegisterMeter("system/disk/writecount", DefaultRegistry)
diskWriteBytes = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry)
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
)
// Iterate loading the different stats and updating the meters
for i := 1; ; i++ {
location1 := i % 2
location2 := (i - 1) % 2

ReadCPUStats(cpuStats[location1])
cpuSysLoad.Update((cpuStats[location1].GlobalTime - cpuStats[location2].GlobalTime) / refreshFreq)
cpuSysWait.Update((cpuStats[location1].GlobalWait - cpuStats[location2].GlobalWait) / refreshFreq)
cpuProcLoad.Update((cpuStats[location1].LocalTime - cpuStats[location2].LocalTime) / refreshFreq)

// Iterate loading the different stats and updating the meters.
now, prev := 0, 1
for ; ; now, prev = prev, now {
// CPU
ReadCPUStats(&cpustats[now])
cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq)
cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq)
cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq)

// Threads
cpuThreads.Update(int64(threadCreateProfile.Count()))
cpuGoroutines.Update(int64(runtime.NumGoroutine()))

runtime.ReadMemStats(memstats[location1])
memPauses.Mark(int64(memstats[location1].PauseTotalNs - memstats[location2].PauseTotalNs))
memAllocs.Mark(int64(memstats[location1].Mallocs - memstats[location2].Mallocs))
memFrees.Mark(int64(memstats[location1].Frees - memstats[location2].Frees))
memHeld.Update(int64(memstats[location1].HeapSys - memstats[location1].HeapReleased))
memUsed.Update(int64(memstats[location1].Alloc))

if ReadDiskStats(diskstats[location1]) == nil {
diskReads.Mark(diskstats[location1].ReadCount - diskstats[location2].ReadCount)
diskReadBytes.Mark(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes)
diskWrites.Mark(diskstats[location1].WriteCount - diskstats[location2].WriteCount)
diskWriteBytes.Mark(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes)

diskReadBytesCounter.Inc(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes)
diskWriteBytesCounter.Inc(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes)

// Go runtime metrics
readRuntimeStats(&rstats[now])

cpuGoroutines.Update(int64(rstats[now].Goroutines))
cpuSchedLatency.update(rstats[now].SchedLatency)
memPauses.update(rstats[now].GCPauses)

memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes))
memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes))

memTotal.Update(int64(rstats[now].MemTotal))
heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased))
heapObjects.Update(int64(rstats[now].HeapObjects))

// Disk
if ReadDiskStats(&diskstats[now]) == nil {
diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount)
diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount)
diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes)
diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes)
}

time.Sleep(refresh)
}
}
32 changes: 5 additions & 27 deletions metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,18 @@ package metrics

import (
"fmt"
"io"
"log"
"sync"
"testing"
"time"
)

const FANOUT = 128

// Stop the compiler from complaining during debugging.
var (
_ = io.Discard
_ = log.LstdFlags
)
func TestReadRuntimeValues(t *testing.T) {
var v runtimeStats
readRuntimeStats(&v)
t.Logf("%+v", v)
}

func BenchmarkMetrics(b *testing.B) {
r := NewRegistry()
Expand All @@ -26,7 +24,6 @@ func BenchmarkMetrics(b *testing.B) {
m := NewRegisteredMeter("meter", r)
t := NewRegisteredTimer("timer", r)
RegisterDebugGCStats(r)
RegisterRuntimeMemStats(r)
b.ResetTimer()
ch := make(chan bool)

Expand All @@ -48,24 +45,6 @@ func BenchmarkMetrics(b *testing.B) {
}()
//*/

wgR := &sync.WaitGroup{}
//*
wgR.Add(1)
go func() {
defer wgR.Done()
//log.Println("go CaptureRuntimeMemStats")
for {
select {
case <-ch:
//log.Println("done CaptureRuntimeMemStats")
return
default:
CaptureRuntimeMemStatsOnce(r)
}
}
}()
//*/

wgW := &sync.WaitGroup{}
/*
wgW.Add(1)
Expand Down Expand Up @@ -104,7 +83,6 @@ func BenchmarkMetrics(b *testing.B) {
wg.Wait()
close(ch)
wgD.Wait()
wgR.Wait()
wgW.Wait()
}

Expand Down
Loading

0 comments on commit 4e27af2

Please sign in to comment.