Skip to content

Commit ed6596f

Browse files
committed
feat(prom-client) add implementation for collecting event loop lag, garbage collector, heap size and heap space
1 parent a288410 commit ed6596f

File tree

9 files changed

+675
-52
lines changed

9 files changed

+675
-52
lines changed

plugins/node/instrumentation-runtime-node/src/instrumentation.ts

Lines changed: 32 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -13,83 +13,70 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
import { EventLoopUtilization, performance } from 'node:perf_hooks';
17-
const { eventLoopUtilization } = performance;
18-
1916
import { InstrumentationBase } from '@opentelemetry/instrumentation';
2017

2118
import { VERSION } from './version';
2219
import { RuntimeNodeInstrumentationConfig } from './types';
20+
import { MetricCollector } from './types/metricCollector';
21+
import { EventLoopUtilizationCollector } from './metrics/eventLoopUtilizationCollector';
22+
import { EventLoopLagCollector } from './metrics/eventLoopLagCollector';
23+
import { GCCollector } from './metrics/gcCollector';
24+
import { HeapSizeAndUsedCollector } from './metrics/heapSizeAndUsedCollector';
25+
import { HeapSpacesSizeAndUsedCollector } from './metrics/heapSpacesSizeAndUsedCollector';
2326

24-
const ELUS_LENGTH = 2;
2527
const DEFAULT_CONFIG: RuntimeNodeInstrumentationConfig = {
26-
eventLoopUtilizationMeasurementInterval: 5000,
28+
monitoringPrecision: 5000,
2729
};
2830

31+
const namePrefix = 'nodejs';
32+
2933
export class RuntimeNodeInstrumentation extends InstrumentationBase {
30-
private _ELUs: EventLoopUtilization[] = [];
31-
private _interval: NodeJS.Timeout | undefined;
34+
private _collectors: MetricCollector[] = [];
3235

3336
constructor(config: RuntimeNodeInstrumentationConfig = {}) {
3437
super(
3538
'@opentelemetry/instrumentation-runtime-node',
3639
VERSION,
3740
Object.assign({}, DEFAULT_CONFIG, config)
3841
);
39-
}
40-
41-
private _addELU() {
42-
this._ELUs.unshift(eventLoopUtilization());
43-
if (this._ELUs.length > ELUS_LENGTH) {
44-
this._ELUs.pop();
42+
this._collectors = [
43+
new EventLoopUtilizationCollector(this._config, namePrefix),
44+
new EventLoopLagCollector(this._config, namePrefix),
45+
new GCCollector(this._config, namePrefix),
46+
new HeapSizeAndUsedCollector(this._config, namePrefix),
47+
new HeapSpacesSizeAndUsedCollector(this._config, namePrefix),
48+
];
49+
if (this._config.enabled) {
50+
for (const collector of this._collectors) {
51+
collector.enable();
52+
}
4553
}
4654
}
4755

48-
private _clearELU() {
49-
if (!this._ELUs) {
50-
this._ELUs = [];
51-
}
52-
this._ELUs.length = 0;
53-
}
54-
5556
// Called when a new `MeterProvider` is set
5657
// the Meter (result of @opentelemetry/api's getMeter) is available as this.meter within this method
5758
override _updateMetricInstruments() {
58-
this.meter
59-
.createObservableGauge('nodejs.event_loop.utilization', {
60-
description: 'Event loop utilization',
61-
unit: '1',
62-
})
63-
.addCallback(async observableResult => {
64-
if (this._ELUs.length !== ELUS_LENGTH) {
65-
return;
66-
}
67-
const elu = eventLoopUtilization(...this._ELUs);
68-
observableResult.observe(elu.utilization);
69-
});
59+
if (!this._collectors) return;
60+
for (const collector of this._collectors) {
61+
collector.updateMetricInstruments(this.meter);
62+
}
7063
}
7164

7265
init() {
7366
// Not instrumenting or patching a Node.js module
7467
}
7568

7669
override enable() {
77-
this._clearELU();
78-
this._addELU();
79-
clearInterval(this._interval);
80-
this._interval = setInterval(
81-
() => this._addELU(),
82-
(this._config as RuntimeNodeInstrumentationConfig)
83-
.eventLoopUtilizationMeasurementInterval
84-
);
70+
if (!this._collectors) return;
8571

86-
// unref so that it does not keep the process running if disable() is never called
87-
this._interval?.unref();
72+
for (const collector of this._collectors) {
73+
collector.enable();
74+
}
8875
}
8976

9077
override disable() {
91-
this._clearELU();
92-
clearInterval(this._interval);
93-
this._interval = undefined;
78+
for (const collector of this._collectors) {
79+
collector.disable();
80+
}
9481
}
9582
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
import { MetricCollector } from '../types/metricCollector';
17+
import { Meter } from '@opentelemetry/api';
18+
import { clearInterval } from 'node:timers';
19+
import { RuntimeNodeInstrumentationConfig } from '../types';
20+
21+
/**
 * Shared scaffolding for runtime metric collectors.
 *
 * While enabled, the collector calls {@link BaseCollector.scrape} on a timer
 * (every `config.monitoringPrecision` milliseconds) and buffers each truthy
 * result in `_scrapeQueue`, from which subclasses read when their observable
 * callbacks run.
 *
 * @typeParam T - shape of one sample produced by `scrape()`.
 */
export abstract class BaseCollector<T> implements MetricCollector {
  /**
   * Upper bound on buffered samples. The timer keeps producing samples even
   * when nothing consumes them (no metric reader, or a reader that collects
   * less often than the scrape interval), so without a bound the queue would
   * grow for the lifetime of the process.
   */
  private static readonly MAX_QUEUED_SAMPLES = 100;

  protected _config: RuntimeNodeInstrumentationConfig = {};

  protected namePrefix: string;
  private _interval: NodeJS.Timeout | undefined;
  protected _scrapeQueue: T[] = [];

  /**
   * @param config - instrumentation configuration; `monitoringPrecision` is
   *   used as the scrape interval in milliseconds.
   * @param namePrefix - prefix that subclasses prepend to their metric names.
   */
  constructor(
    config: RuntimeNodeInstrumentationConfig = {},
    namePrefix: string
  ) {
    this._config = config;
    this.namePrefix = namePrefix;
  }

  /** Stops the scrape timer, drops buffered samples and runs the subclass hook. */
  public disable(): void {
    this._clearQueue();
    clearInterval(this._interval);
    this._interval = undefined;

    this.internalDisable();
  }

  /**
   * (Re)starts the scrape timer with an empty queue and runs the subclass hook.
   * Safe to call repeatedly: any previous timer is cleared first.
   */
  public enable(): void {
    this._clearQueue();
    clearInterval(this._interval);
    // NOTE(review): if `monitoringPrecision` is undefined Node falls back to
    // its minimum timer delay; callers are expected to supply a default.
    this._interval = setInterval(
      () => this._addTask(),
      this._config.monitoringPrecision
    );

    // unref so that it does not keep the process running if disable() is never called
    this._interval?.unref();

    this.internalEnable();
  }

  /** Empties the sample queue in place so existing references stay valid. */
  private _clearQueue(): void {
    this._scrapeQueue.length = 0;
  }

  /** Takes one sample and buffers it, evicting the oldest samples on overflow. */
  private _addTask(): void {
    const taskResult = this.scrape();
    if (!taskResult) {
      return;
    }

    this._scrapeQueue.push(taskResult);

    const overflow =
      this._scrapeQueue.length - BaseCollector.MAX_QUEUED_SAMPLES;
    if (overflow > 0) {
      this._scrapeQueue.splice(0, overflow);
    }
  }

  /** Creates the collector's instruments on the given meter. */
  public abstract updateMetricInstruments(meter: Meter): void;
  /** Subclass hook invoked at the end of `enable()`. */
  protected abstract internalEnable(): void;
  /** Subclass hook invoked at the end of `disable()`. */
  protected abstract internalDisable(): void;
  /** Produces one sample; falsy results are not buffered. */
  protected abstract scrape(): T;
}
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
import { RuntimeNodeInstrumentationConfig } from '../types';
17+
import { Meter } from '@opentelemetry/api';
18+
import { IntervalHistogram } from 'node:perf_hooks';
19+
import { BaseCollector } from './baseCollector';
20+
import * as perf_hooks from 'node:perf_hooks';
21+
22+
// Metric name suffixes for the event loop lag instruments. The collector
// prepends its `namePrefix` and a dot when it creates each instrument.
const NODEJS_EVENTLOOP_LAG = 'event_loop.lag_seconds';
23+
const NODEJS_EVENTLOOP_LAG_MIN = 'event_loop.lag_min_seconds';
24+
const NODEJS_EVENTLOOP_LAG_MAX = 'event_loop.lag_max_seconds';
25+
const NODEJS_EVENTLOOP_LAG_MEAN = 'event_loop.lag_mean_seconds';
26+
const NODEJS_EVENTLOOP_LAG_STDDEV = 'event_loop.lag_stddev_seconds';
27+
const NODEJS_EVENTLOOP_LAG_P50 = 'event_loop.lag_p50_seconds';
28+
const NODEJS_EVENTLOOP_LAG_P90 = 'event_loop.lag_p90_seconds';
29+
const NODEJS_EVENTLOOP_LAG_P99 = 'event_loop.lag_p99_seconds';
30+
31+
/**
 * Name/description pairs for every event loop lag instrument.
 *
 * The order is significant: `EventLoopLagCollector.updateMetricInstruments`
 * looks entries up by position (0 = lag, 1 = min, 2 = max, 3 = mean,
 * 4 = stddev, 5 = p50, 6 = p90, 7 = p99), so new entries must be appended
 * rather than inserted.
 */
export const metricNames = [
32+
{ name: NODEJS_EVENTLOOP_LAG, description: 'Lag of event loop in seconds.' },
33+
{
34+
name: NODEJS_EVENTLOOP_LAG_MIN,
35+
description: 'The minimum recorded event loop delay.',
36+
},
37+
{
38+
name: NODEJS_EVENTLOOP_LAG_MAX,
39+
description: 'The maximum recorded event loop delay.',
40+
},
41+
{
42+
name: NODEJS_EVENTLOOP_LAG_MEAN,
43+
description: 'The mean of the recorded event loop delays.',
44+
},
45+
{
46+
name: NODEJS_EVENTLOOP_LAG_STDDEV,
47+
description: 'The standard deviation of the recorded event loop delays.',
48+
},
49+
{
50+
name: NODEJS_EVENTLOOP_LAG_P50,
51+
description: 'The 50th percentile of the recorded event loop delays.',
52+
},
53+
{
54+
name: NODEJS_EVENTLOOP_LAG_P90,
55+
description: 'The 90th percentile of the recorded event loop delays.',
56+
},
57+
{
58+
name: NODEJS_EVENTLOOP_LAG_P99,
59+
description: 'The 99th percentile of the recorded event loop delays.',
60+
},
61+
];
62+
63+
/**
 * One sample of event loop delay statistics, as produced by
 * `EventLoopLagCollector.scrape()`.
 *
 * Each field is read from the `perf_hooks` event loop delay histogram and
 * divided by 1e9 (presumably nanoseconds to seconds, matching the `_seconds`
 * metric names); NaN readings are replaced with 0.
 */
export interface EventLoopLagInformation {
64+
min: number;
65+
max: number;
66+
mean: number;
67+
stddev: number;
68+
p50: number;
69+
p90: number;
70+
p99: number;
71+
}
72+
73+
/**
 * Reports event loop lag metrics.
 *
 * Two sources are combined:
 * - a `perf_hooks.monitorEventLoopDelay()` histogram, sampled periodically by
 *   the base class through {@link EventLoopLagCollector.scrape}, which feeds
 *   the min/max/mean/stddev/percentile gauges;
 * - a one-off `setImmediate` round-trip timed at collection time, which feeds
 *   the plain `lag` gauge.
 *
 * All values are reported in seconds.
 */
export class EventLoopLagCollector extends BaseCollector<EventLoopLagInformation> {
  private _histogram: IntervalHistogram;

  /**
   * @param config - instrumentation configuration; `monitoringPrecision` is
   *   passed to the histogram as its sampling resolution (milliseconds).
   * @param namePrefix - prefix prepended to every metric name.
   */
  constructor(
    config: RuntimeNodeInstrumentationConfig = {},
    namePrefix: string
  ) {
    super(config, namePrefix);
    this._histogram = perf_hooks.monitorEventLoopDelay({
      resolution: config.monitoringPrecision,
    });
  }

  /**
   * Creates the eight lag gauges on `meter` and registers a single batch
   * callback that reports all of them together.
   */
  updateMetricInstruments(meter: Meter): void {
    // Every instrument carries a duration in seconds, hence unit 's'.
    const gaugeFor = (metric: (typeof metricNames)[number]) =>
      meter.createObservableGauge(`${this.namePrefix}.${metric.name}`, {
        description: metric.description,
        unit: 's',
      });

    // `metricNames` is positional: 0 = lag, 1..4 = min/max/mean/stddev,
    // 5..7 = p50/p90/p99.
    const lag = gaugeFor(metricNames[0]);
    const lagMin = gaugeFor(metricNames[1]);
    const lagMax = gaugeFor(metricNames[2]);
    const lagMean = gaugeFor(metricNames[3]);
    const lagStddev = gaugeFor(metricNames[4]);
    const lagp50 = gaugeFor(metricNames[5]);
    const lagp90 = gaugeFor(metricNames[6]);
    const lagp99 = gaugeFor(metricNames[7]);

    meter.addBatchObservableCallback(
      async observableResult => {
        // The histogram accumulates until it is reset below, so every queued
        // sample is a snapshot of the same window and the newest one
        // supersedes the rest. Report that one and drop the others; consuming
        // one sample per collection would report stale data and let the queue
        // grow whenever scraping outpaces collection.
        const data = this._scrapeQueue.pop();
        this._scrapeQueue.length = 0;
        if (data === undefined) return;

        const lagResult = await this._measureEventLoopLag();

        observableResult.observe(lag, lagResult);
        observableResult.observe(lagMin, data.min);
        observableResult.observe(lagMax, data.max);
        observableResult.observe(lagMean, data.mean);
        observableResult.observe(lagStddev, data.stddev);
        observableResult.observe(lagp50, data.p50);
        observableResult.observe(lagp90, data.p90);
        observableResult.observe(lagp99, data.p99);

        // Start a fresh measurement window for the next collection.
        this._histogram.reset();
      },
      [lag, lagMin, lagMax, lagMean, lagStddev, lagp50, lagp90, lagp99]
    );
  }

  /** Starts the event loop delay histogram. */
  internalEnable(): void {
    this._histogram.enable();
  }

  /** Stops the event loop delay histogram. */
  internalDisable(): void {
    this._histogram.disable();
  }

  /**
   * Snapshots the histogram, converting nanoseconds to seconds.
   * NaN readings (e.g. mean of an empty histogram) are reported as 0.
   */
  protected scrape(): EventLoopLagInformation {
    const toSeconds = (nanoseconds: number): number =>
      this.checkNan(nanoseconds / 1e9);

    return {
      min: toSeconds(this._histogram.min),
      max: toSeconds(this._histogram.max),
      mean: toSeconds(this._histogram.mean),
      stddev: toSeconds(this._histogram.stddev),
      p50: toSeconds(this._histogram.percentile(50)),
      p90: toSeconds(this._histogram.percentile(90)),
      p99: toSeconds(this._histogram.percentile(99)),
    };
  }

  /**
   * Times how long a `setImmediate` callback takes to be invoked.
   *
   * @returns the observed delay in seconds.
   */
  private _measureEventLoopLag(): Promise<number> {
    const start = process.hrtime();
    return new Promise<number>(resolve => {
      setImmediate(() => resolve(this._reportEventloopLag(start)));
    });
  }

  /**
   * @param start - `process.hrtime()` reading taken when the probe was scheduled.
   * @returns seconds elapsed since `start`.
   */
  private _reportEventloopLag(start: [number, number]): number {
    const delta = process.hrtime(start);
    const nanosec = delta[0] * 1e9 + delta[1];
    const seconds = nanosec / 1e9;
    return seconds;
  }

  /** Maps NaN to 0 so that gauges never observe NaN. */
  private checkNan(value: number): number {
    return isNaN(value) ? 0 : value;
  }
}

0 commit comments

Comments
 (0)