IPC channel stops delivering messages to cluster workers #9706
Closed
Description
- Version: 6.9.1 (also reproduced on 4.4.1 and 0.12.4)
- Platform: OS X, possibly others (I have been unable to repro on Linux, but I believe it does happen there, just less frequently)
- Subsystem: Cluster / IPC
When many IPC messages are sent between the master process and cluster workers, IPC channels to workers stop delivering messages. I have not been able to restore working functionality of the workers, so they must be killed to resolve the issue. Since IPC has stopped working, simply using Worker.destroy()
does not work since the method will wait for the disconnect
event which never arrives (because of this issue).
I am able to repro on OS X by running the following script:
// Repro script: floods the cluster IPC channel in both directions while the
// master pings every worker once per second. A worker whose last pong is more
// than 10 seconds old is reported as having missed pings.
var cluster = require('cluster');
var express = require('express'); // tested with 4.14.0

const workerCount = 2;
const WTMIPC = 25; // messages a worker sends to the master per HTTP request
const MTWIPC = 25; // messages the master sends back per 'fromEndpoint' message

if (cluster.isMaster) {
  // Both maps are keyed by worker pid.
  var workers = {};
  var workerPongReceivedTime = {};
  var worker;

  for (var i = 0; i < workerCount; i++) {
    worker = cluster.fork({});
    workers[worker.process.pid] = worker;
    // Seed the timestamp at fork time. Without this, a worker that never
    // answers a single ping has an undefined entry, the staleness check below
    // compares against NaN, and the worker is never reported.
    workerPongReceivedTime[worker.process.pid] = Date.now();
  }

  cluster.on('online', function(worker) {
    worker.on('message', function(message) {
      var currentTime = Date.now();
      if (message.type === 'pong') {
        // Record when the pong arrived and log the one-way and round-trip
        // latencies in milliseconds.
        workerPongReceivedTime[worker.process.pid] = currentTime;
        console.log('received pong\tmaster-to-worker\t' + (message.timeReceived - message.timeSent) + '\tworker-to-master ' + (currentTime - message.timeSent));
      } else if (message.type === 'fromEndpoint') {
        // Amplify traffic: answer each worker message with MTWIPC messages.
        for (var i = 0; i < MTWIPC; i++) {
          worker.send({ type: 'toWorker' });
        }
      }
    });
  });

  // Once per second, ping every worker and report those whose last pong
  // (or fork time, if none was ever received) is more than 10 seconds old.
  setInterval(function() {
    var currentTime = Date.now();
    console.log('sending ping');
    Object.keys(workers).forEach(function(workerPid) {
      workers[workerPid].send({ type: 'ping', time: Date.now() });
      if (currentTime - workerPongReceivedTime[workerPid] > 10000) {
        console.log('Worker missed pings: ' + workerPid);
      }
    });
  }, 1000);
} else {
  var app = express();

  // Each request to /test sends WTMIPC messages to the master over IPC.
  app.get('/test', function(req, res) {
    // Declare the counter locally; the original leaked it as an implicit global.
    for (var i = 0; i < WTMIPC; i++) {
      process.send({ type: 'fromEndpoint' });
    }
    res.send({ test: 123 });
  });

  app.listen(7080, function() {
    console.log('server started');
  });

  // Answer pings from the master, echoing the send time so the master can
  // compute latencies. Other message types (e.g. 'toWorker') are ignored.
  process.on('message', function(message) {
    if (message.type === 'ping') {
      process.send({ type: 'pong', timeSent: message.time, timeReceived: Date.now() });
    }
  });
}
and using ApacheBench to place the server under load as follows:
ab -n 100000 -c 200 'http://localhost:7080/test'
I see the following, for example:
server started
server started
sending ping
received pong master-to-worker 1 worker-to-master 1
received pong master-to-worker 0 worker-to-master 1
sending ping
received pong master-to-worker 1 worker-to-master 3
received pong master-to-worker 19 worker-to-master 21
sending ping
received pong master-to-worker 2 worker-to-master 5
received pong master-to-worker 4 worker-to-master 7
sending ping
received pong master-to-worker 3 worker-to-master 4
received pong master-to-worker 4 worker-to-master 6
sending ping
received pong master-to-worker 9 worker-to-master 10
received pong master-to-worker 2 worker-to-master 10
sending ping
received pong master-to-worker 2 worker-to-master 4
received pong master-to-worker 4 worker-to-master 6
sending ping
received pong master-to-worker 2 worker-to-master 4
received pong master-to-worker 4 worker-to-master 6
... (about 10k - 60k requests later) ...
sending ping
sending ping
sending ping
sending ping
sending ping
sending ping
sending ping
sending ping
sending ping
sending ping
Worker missed pings: 97462
sending ping
Worker missed pings: 97462
Worker missed pings: 97463
sending ping
Worker missed pings: 97462
Worker missed pings: 97463
As I alluded to earlier, I have seen an issue on Linux which I believe is related, but so far I have been unable to reproduce it there using this technique.