Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement signal handler to avoid async-unsafe functions #97

Closed
wants to merge 7 commits into from
27 changes: 27 additions & 0 deletions core/unix/inc/TUnixSystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,31 @@
#include "TTimer.h"
#endif

#include <thread>
#include <memory>

int StackTraceExec(void *);

typedef void (*SigHandler_t)(ESignals);


class TUnixSystem : public TSystem {

friend int StackTraceExec(void *);

private:
struct StackTraceHelper_t {
static const int fStringLength = 255;
char fShellExec[fStringLength];
char fPidString[fStringLength];
char fPidNum[fStringLength];
int fParentToChild[2];
int fChildToParent[2];
std::unique_ptr<std::thread> fHelperThread;
};

static StackTraceHelper_t fStackTraceHelper;

protected:
const char *FindDynamicLibrary(TString &lib, Bool_t quiet = kFALSE);
const char *GetLinkedLibraries();
Expand Down Expand Up @@ -71,6 +91,12 @@ class TUnixSystem : public TSystem {
static int UnixRecv(int sock, void *buf, int len, int flag);
static int UnixSend(int sock, const void *buf, int len, int flag);

// added helper static members for stacktrace
static char *const kStackArgv[];
static char *const *GetStackArgv();
static void StackTraceHelperThread();
void CachePidInfo();

public:
TUnixSystem();
virtual ~TUnixSystem();
Expand Down Expand Up @@ -121,6 +147,7 @@ class TUnixSystem : public TSystem {
void Abort(int code = 0);
int GetPid();
void StackTrace();
static void StackTraceFromThread();

//---- Directories ------------------------------------------
int MakeDirectory(const char *name);
Expand Down
266 changes: 263 additions & 3 deletions core/unix/src/TUnixSystem.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,15 @@
#include <algorithm>
#include <atomic>

#ifdef __linux__
#include <syscall.h>
#endif

//#define G__OLDEXPAND

#include <unistd.h>
#include <stdlib.h>
#include <poll.h>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not available on windows. Would using select be an option? Otherwise we need to hide this (and code using it) behind
ifndef R__WIN32

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

select could work, but it may be better to just disable the timeouts on Windows; on RHEL7 with some common GCC options, select triggers an assert when the FD is >= 1024.

(Is there a signal-safe read function elsewhere in ROOT that has timeout? Seems silly that it is only used here.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TSystem::Select uses ::select on linux and windows.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - TSystem::Select calculates its own bitmask instead of using FD_SET (see http://sourceforge.net/p/opalvoip/bugs/496/ for a decent explanation). FD_SET is what causes the assert for large FDs.

Unfortunately, we can't use TSystem::Select because it internally uses TFdSet, which can lead to memory allocations (unsafe in the signal handler).

I guess we just have to use select on Windows and fail if fd > 1024.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bbockelm Since fSignals in TSystem has been also defined as TFdSet, should we change the data structure in the new signal handling? For instance, could use linux fd_set instead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, fSignals doesn't need to be changed because it isn't invoked from a signal handler.

Unix signal handlers have an extremely limited set of operations they are allowed to do: this is why we must be very careful to not call most ROOT functions.

#include <sys/types.h>
#if defined(R__SUN) || defined(R__AIX) || \
defined(R__LINUX) || defined(R__SOLARIS) || \
Expand Down Expand Up @@ -386,6 +391,85 @@ class TFdSet {
ULong_t *GetBits() { return (ULong_t *)fds_bits; }
};

////////////////////////////////////////////////////////////////////////////////
/// Async-signal-safe Write functions.
static int SignalSafeWrite(int fd, const char *text) {
const char *buffer = text;
size_t count = strlen(text);
ssize_t written = 0;
while (count) {
written = write(fd, buffer, count);
if (written == -1) {
if (errno == EINTR) { continue; }
else { return -errno; }
}
count -= written;
buffer += written;
}
return 0;
}

////////////////////////////////////////////////////////////////////////////////
/// Async-signal-safe Read functions.
static int SignalSafeRead(int fd, char *inbuf, size_t len, int timeout=-1) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing function documentation.

char *buf = inbuf;
size_t count = len;
ssize_t complete = 0;
std::chrono::time_point<std::chrono::steady_clock> endTime = std::chrono::steady_clock::now() + std::chrono::seconds(timeout);
int flags;
if (timeout < 0) {
flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
} else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
return -errno;
} else { }
if ((flags & O_NONBLOCK) != O_NONBLOCK) {
if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
return -errno;
}
}
while (count) {
if (timeout >= 0) {
struct pollfd pollInfo{fd, POLLIN, 0};
int msRemaining = std::chrono::duration_cast<std::chrono::milliseconds>(endTime-std::chrono::steady_clock::now()).count();
if (msRemaining > 0) {
if (poll(&pollInfo, 1, msRemaining) == 0) {
if ((flags & O_NONBLOCK) != O_NONBLOCK) {
fcntl(fd, F_SETFL, flags);
}
return -ETIMEDOUT;
}
} else if (msRemaining < 0) {
if ((flags & O_NONBLOCK) != O_NONBLOCK) {
fcntl(fd, F_SETFL, flags);
}
return -ETIMEDOUT;
} else { }
}
complete = read(fd, buf, count);
if (complete == -1) {
if (errno == EINTR) { continue; }
else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { continue; }
else {
int origErrno = errno;
if ((flags & O_NONBLOCK) != O_NONBLOCK) {
fcntl(fd, F_SETFL, flags);
}
return -origErrno;
}
}
count -= complete;
buf += complete;
}
if ((flags & O_NONBLOCK) != O_NONBLOCK) {
fcntl(fd, F_SETFL, flags);
}
return 0;
}

static int SignalSafeErrWrite(const char *text) {
return SignalSafeWrite(2, text);
}

////////////////////////////////////////////////////////////////////////////////
/// Unix signal handler.

Expand Down Expand Up @@ -569,6 +653,9 @@ TUnixSystem::~TUnixSystem()
////////////////////////////////////////////////////////////////////////////////
/// Initialize Unix system interface.

TUnixSystem::StackTraceHelper_t TUnixSystem::fStackTraceHelper;
char * const TUnixSystem::kStackArgv[] = {TUnixSystem::fStackTraceHelper.fShellExec, TUnixSystem::fStackTraceHelper.fPidString, TUnixSystem::fStackTraceHelper.fPidNum, nullptr};

Bool_t TUnixSystem::Init()
{
if (TSystem::Init())
Expand Down Expand Up @@ -608,6 +695,30 @@ Bool_t TUnixSystem::Init()
gRootDir = ROOTPREFIX;
#endif

if(snprintf(fStackTraceHelper.fShellExec, fStackTraceHelper.fStringLength-1, "/bin/sh") >= fStackTraceHelper.fStringLength) {
SignalSafeErrWrite("Unable to pre-allocate shell command path");
return kFALSE;
}

#ifdef ROOTETCDIR
if(snprintf(fStackTraceHelper.fPidString, fStackTraceHelper.fStringLength-1, "%s/gdb-backtrace.sh", ROOTETCDIR) >= fStackTraceHelper.fStringLength) {
SignalSafeErrWrite("Unable to pre-allocate executable information");
return kFALSE;
}
#else
if(snprintf(fStackTraceHelper.fPidString, fStackTraceHelper.fStringLength-1, "%s/etc/gdb-backtrace.sh", gSystem->Getenv("ROOTSYS")) >= fStackTraceHelper.fStringLength) {
SignalSafeErrWrite("Unable to pre-allocate executable information");
return kFALSE;
}
#endif

fStackTraceHelper.fParentToChild[0] = -1;
fStackTraceHelper.fParentToChild[1] = -1;
fStackTraceHelper.fChildToParent[0] = -1;
fStackTraceHelper.fChildToParent[1] = -1;

CachePidInfo();

return kFALSE;
}

Expand Down Expand Up @@ -2154,10 +2265,8 @@ void TUnixSystem::StackTrace()
}
gdbscript += " ";
}

TString gdbmess = gEnv->GetValue("Root.StacktraceMessage", "");
gdbmess = gdbmess.Strip();

std::cout.flush();
fflush(stdout);

Expand Down Expand Up @@ -2286,7 +2395,6 @@ void TUnixSystem::StackTrace()
fprintf(f, "%s\n", gdbmess.Data());
fclose(f);
}

// use gdb to get stack trace
#ifdef R__MACOSX
gdbscript += GetExePath();
Expand Down Expand Up @@ -3524,6 +3632,7 @@ static void sighandler(int sig)

void TUnixSystem::DispatchSignals(ESignals sig)
{
const char* signalname = "unknown";
switch (sig) {
case kSigAlarm:
DispatchTimers(kFALSE);
Expand All @@ -3532,8 +3641,14 @@ void TUnixSystem::DispatchSignals(ESignals sig)
CheckChilds();
break;
case kSigBus:
signalname = "bus error";
break;
case kSigSegmentationViolation:
signalname = "segmentation violation";
break;
case kSigIllegalInstruction:
signalname = "illegal instruction";
break;
case kSigFloatingException:
Break("TUnixSystem::DispatchSignals", "%s", UnixSigname(sig));
StackTrace();
Expand All @@ -3558,6 +3673,20 @@ void TUnixSystem::DispatchSignals(ESignals sig)
break;
}

if ((sig == kSigIllegalInstruction) || (sig == kSigSegmentationViolation) || (sig == kSigBus))
{

SignalSafeErrWrite("\n\nA fatal system signal has occurred: ");
SignalSafeErrWrite(signalname);
SignalSafeErrWrite("\nThe following is the call stack containing the origin of the signal.\n"
"NOTE:The first few functions on the stack are artifacts of processing the signal and can be ignored\n\n");

TUnixSystem::StackTraceFromThread();

signal(sig, SIG_DFL);
raise(sig);
}

// check a-synchronous signals
if (fSigcnt > 0 && fSignalHandler->GetSize() > 0)
CheckSignals(kFALSE);
Expand Down Expand Up @@ -5148,3 +5277,134 @@ int TUnixSystem::GetProcInfo(ProcInfo_t *info) const

return 0;
}

static void StackTraceFork();

void SetDefaultSignals() {
signal(SIGILL, SIG_DFL);
signal(SIGSEGV, SIG_DFL);
signal(SIGBUS, SIG_DFL);
}

void TUnixSystem::StackTraceHelperThread()
{
int toParent = fStackTraceHelper.fChildToParent[1];
int fromParent = fStackTraceHelper.fParentToChild[0];
char buf[2]; buf[1] = '\0';
while(true) {
int result = SignalSafeRead(fromParent, buf, 1, 5*60);
if (result < 0) {
SetDefaultSignals();
close(toParent);
SignalSafeErrWrite("\n\nTraceback helper thread failed to read from parent: ");
SignalSafeErrWrite(strerror(-result));
SignalSafeErrWrite("\n");
::abort();
}
if (buf[0] == '1') {
SetDefaultSignals();
StackTraceFork();
SignalSafeWrite(toParent, buf);
} else if (buf[0] == '2') {
close(toParent);
close(fromParent);
toParent = fStackTraceHelper.fChildToParent[1];
fromParent = fStackTraceHelper.fParentToChild[0];
} else if (buf[0] == '3') {
break;
} else {
SetDefaultSignals();
close(toParent);
SignalSafeErrWrite("\n\nTraceback helper thread got unknown command from parent: ");
SignalSafeErrWrite(buf);
SignalSafeErrWrite("\n");
::abort();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above - reset signal handler.

}
}
}

void TUnixSystem::StackTraceFromThread()
{
int result = SignalSafeWrite(fStackTraceHelper.fParentToChild[1], "1");
if (result < 0) {
SignalSafeErrWrite("\n\nAttempt to request stacktrace failed: ");
SignalSafeErrWrite(strerror(-result));
SignalSafeErrWrite("\n");
return;
}
char buf[2]; buf[1] = '\0';
if ((result = SignalSafeRead(fStackTraceHelper.fChildToParent[0], buf, 1)) < 0) {
SignalSafeErrWrite("\n\nWaiting for stacktrace completion failed: ");
SignalSafeErrWrite(strerror(-result));
SignalSafeErrWrite("\n");
return;
}
}

void StackTraceFork()
{
static const int stackSize = 4*1024;
char childStack[stackSize];
char *childStackPtr = childStack + stackSize;
int pid =
#ifdef __linux__
clone(StackTraceExec, childStackPtr, CLONE_VM|CLONE_FS|SIGCHLD, nullptr);
#else
fork();
if (childStackPtr) {} // Suppress 'unused variable' warning on non-Linux
if (pid == 0) { StackTraceExec(nullptr); ::abort(); }
#endif
if (pid == -1) {
SignalSafeErrWrite("(Attempt to perform stack dump failed.)\n");
} else {
int status;
if (waitpid(pid, &status, 0) == -1) {
SignalSafeErrWrite("(Failed to wait on stack dump output.)\n");
} else {}
}
}

int StackTraceExec(void *)
{
char *const *argv = TUnixSystem::GetStackArgv();
#ifdef __linux__
syscall(SYS_execve, "/bin/sh", argv, __environ);
#else
execv("/bin/sh", argv);
#endif
::abort();
return 1;
}

char *const *TUnixSystem::GetStackArgv() {
return kStackArgv;
}

void TUnixSystem::CachePidInfo()
{
if(snprintf(fStackTraceHelper.fPidNum, fStackTraceHelper.fStringLength-1, "%d", GetPid()) >= fStackTraceHelper.fStringLength) {
SignalSafeErrWrite("Unable to pre-allocate process id information");
return;
}

close(fStackTraceHelper.fChildToParent[0]);
close(fStackTraceHelper.fChildToParent[1]);
fStackTraceHelper.fChildToParent[0] = -1; fStackTraceHelper.fChildToParent[1] = -1;
close(fStackTraceHelper.fParentToChild[0]);
close(fStackTraceHelper.fParentToChild[1]);
fStackTraceHelper.fParentToChild[0] = -1; fStackTraceHelper.fParentToChild[1] = -1;

if (-1 == pipe2(fStackTraceHelper.fChildToParent, O_CLOEXEC)) {
fprintf(stdout, "pipe fStackTraceHelper.fChildToParent failed\n");
return;
}
if (-1 == pipe2(fStackTraceHelper.fParentToChild, O_CLOEXEC)){
close(fStackTraceHelper.fChildToParent[0]); close(fStackTraceHelper.fChildToParent[1]);
fStackTraceHelper.fChildToParent[0] = -1; fStackTraceHelper.fChildToParent[1] = -1;
fprintf(stdout, "pipe parentToChild failed\n");
return;
}

fStackTraceHelper.fHelperThread.reset(new std::thread(StackTraceHelperThread));
fStackTraceHelper.fHelperThread->detach();
}