-
Notifications
You must be signed in to change notification settings - Fork 242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
s3 worker crashing (IPC ?) #546
Comments
It would be helpful to have the logs that preceded this log entry. It appears that this sort of error happens in node if you try to call disconnect on a worker that is already disconnected. We do not call disconnect directly on workers. We call worker.kill(), which does call disconnect. So, I have made some changes here that stop us from calling worker.kill() more than once, and that avoid calling worker.process.kill() if there is not already a suicide state. I do not know, though, whether this is what triggered the problem. @bennettbuchanan will try to reproduce and move this forward.
@nightfox02, can you provide the logs that preceded the log entry above? Searching for 'shutdown of worker due to exception' and 'worker not exiting. killing it' would probably help. |
import http from 'http';
import https from 'https';
import cluster from 'cluster';
import arsenal from 'arsenal';
import { logger } from './utilities/logger';
import { clientCheck } from './utilities/healthcheckHandler';
import _config from './Config';
import routes from './routes';
class S3Server {
    /**
     * This represents our S3 connector.
     * @constructor
     * @param {Worker} [worker=null] - Track the worker when using cluster
     */
    constructor(worker) {
        this.worker = worker;
        // Reuse sockets across requests rather than opening a new
        // connection for every request.
        http.globalAgent.keepAlive = true;
        // Exit cleanly on the usual termination signals.
        process.on('SIGINT', this.cleanUp.bind(this));
        process.on('SIGHUP', this.cleanUp.bind(this));
        process.on('SIGQUIT', this.cleanUp.bind(this));
        process.on('SIGTERM', this.cleanUp.bind(this));
        // Ignore SIGPIPE: a write to a closed socket must not kill the
        // process.
        process.on('SIGPIPE', () => {});
        // This will pick up exceptions up the stack
        process.on('uncaughtException', err => {
            // If just send the error object results in empty
            // object on server log.
            logger.fatal('caught error', { error: err.message,
                stack: err.stack });
            this.caughtExceptionShutdown();
        });
    }

    /**
     * This starts the http(s) server listening.
     * @param {number} port - TCP port to listen on
     * @param {string} [ipAddress] - optional address to bind to; when
     * undefined the server listens on all interfaces
     * @returns {undefined}
     */
    startup(port, ipAddress) {
        // NOTE: never call this.worker.disconnect() here. The worker's
        // IPC channel to the cluster master must stay open so the master
        // can monitor and respawn it; disconnecting (and doubly so,
        // disconnecting twice) raises
        // "Error: IPC channel is already disconnected" (see issue #546).
        // Todo: http.globalAgent.maxSockets, http.globalAgent.maxFreeSockets
        if (_config.https) {
            this.server = https.createServer({
                cert: _config.https.cert,
                key: _config.https.key,
                ca: _config.https.ca,
                ciphers: arsenal.https.ciphers.ciphers,
                dhparam: arsenal.https.dhparam.dhparam,
                rejectUnauthorized: true,
            });
            logger.info('Https server configuration', {
                https: true,
            });
        } else {
            this.server = http.createServer();
            logger.info('Http server configuration', {
                https: false,
            });
        }
        this.server.on('connection', socket => {
            // Log and drop broken client connections instead of letting
            // the error bubble up as an uncaught exception.
            socket.on('error', err => logger.info('request rejected',
                { error: err }));
        });
        this.server.on('request', (req, res) => {
            // disable nagle algorithm
            req.socket.setNoDelay();
            routes(req, res, logger);
        });
        this.server.on('listening', () => {
            // server.address() can be null in edge cases; fall back to
            // the requested bind parameters for logging.
            const addr = this.server.address() || {
                address: ipAddress || '[::]',
                port,
            };
            logger.info('server started', { address: addr.address,
                port: addr.port, pid: process.pid });
        });
        if (ipAddress !== undefined) {
            this.server.listen(port, ipAddress);
        } else {
            this.server.listen(port);
        }
    }

    /*
     * This exits the running process properly.
     */
    cleanUp() {
        logger.info('server shutting down');
        this.server.close();
        process.exit(0);
    }

    /**
     * Shut down this worker after an uncaught exception, forcing a
     * SIGKILL if it has not exited within 2 seconds.
     * @returns {undefined}
     */
    caughtExceptionShutdown() {
        logger.error('shutdown of worker due to exception');
        if (!this.worker) {
            // Not running under cluster (worker=null): just exit.
            process.exit(1);
            return;
        }
        // Will close all servers, cause disconnect event on master and kill
        // worker process with 'SIGTERM'.
        this.worker.kill();
        const killTimer = setTimeout(() => {
            if (!this.worker.isDead()) {
                this.worker.kill('SIGKILL');
            }
        }, 2000);
        // Do not let the kill timer itself keep the process alive.
        killTimer.unref();
    }

    /**
     * Run the initial health check and start listening once it passes;
     * retries every 2 seconds while dependencies are unavailable.
     * @param {object} log - request logger instance
     * @returns {undefined}
     */
    initiateStartup(log) {
        clientCheck(log, (err, results) => {
            if (err) {
                log.warn('initial health check failed, delaying startup', {
                    error: err,
                    healthStatus: results,
                });
                setTimeout(() => this.initiateStartup(log), 2000);
            } else {
                log.info('initial health check succeeded', {
                    healthStatus: results,
                });
                if (_config.listenOn.length > 0) {
                    _config.listenOn.forEach(item => {
                        this.startup(item.port, item.ip);
                    });
                } else {
                    this.startup(_config.port);
                }
            }
        });
    }
}
/**
 * Entry point: in the cluster master, fork and babysit the configured
 * number of workers; in a worker, build an S3Server and start it once
 * the initial health check passes.
 * @returns {undefined}
 */
export default function main() {
    // The in-memory backend cannot be shared across processes, so force
    // a single worker in that mode; otherwise honor the configured
    // cluster size (defaulting to 1).
    let clusters = _config.clusters || 1;
    if (process.env.S3BACKEND === 'mem') {
        clusters = 1;
    }
    if (!cluster.isMaster) {
        // Worker process: serve requests.
        const server = new S3Server(cluster.worker);
        server.initiateStartup(logger.newRequestLogger());
        return;
    }
    // Master process: spawn the initial worker pool.
    for (let i = 0; i < clusters; i++) {
        cluster.fork();
    }
    // Every second, top the pool back up if any workers have died.
    setInterval(() => {
        const alive = Object.keys(cluster.workers).length;
        if (alive >= clusters) {
            return;
        }
        for (let i = alive; i < clusters; i++) {
            const newWorker = cluster.fork();
            logger.error('new worker forked',
                { workerId: newWorker.id });
        }
    }, 1000);
    // A disconnected worker should exit on its own; give it 2 seconds,
    // then force-kill it.
    cluster.on('disconnect', worker => {
        logger.error('worker disconnected. making sure exits',
            { workerId: worker.id });
        setTimeout(() => {
            if (worker.isDead()) {
                return;
            }
            logger.error('worker not exiting. killing it');
            worker.process.kill('SIGKILL');
        }, 2000);
    });
    cluster.on('exit', worker => {
        logger.error('worker exited.',
            { workerId: worker.id });
    });
}
5 seconds prior to IPC event on .105 (majority), on VPN: /home/public/mc/546.seconds.prior.txt s3-0.log.2:{"name":"S3","time":1485102305480,"error":"IPC channel is already disconnected","stack":"Error: IPC channel is already disconnected\n at process.target.disconnect (internal/child_process.js:637:26)\n at Worker. (cluster.js:682:44)\n at process. (cluster.js:749:8)\n at emitTwo (events.js:92:20)\n at process.emit (events.js:172:7)\n at internal/child_process.js:696:12\n at nextTickCallbackWith0Args (node.js:420:9)\n at process._tickCallback (node.js:349:13)","level":"fatal","message":"caught error","hostname":"redacted","pid":14059} 1485102305480 Assuming that this timestamp is in milliseconds: |
Should be mitigated by #578 |
It seems the s3 worker process might be crashing. This could be related to a metadata outage.
The stack trace is :
The text was updated successfully, but these errors were encountered: