diff --git a/.travis.yml b/.travis.yml index 09c865a4..c00b54c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,9 @@ matrix: - env: PLATFORM="x86_64" script: + - make -C docker/vsyscall_emu - docker build --rm -t quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT -f docker/Dockerfile-$PLATFORM docker/ + - docker/vsyscall_emu/test.sh quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT deploy: diff --git a/docker/Dockerfile-x86_64 b/docker/Dockerfile-x86_64 index abcfcb13..7af4267c 100644 --- a/docker/Dockerfile-x86_64 +++ b/docker/Dockerfile-x86_64 @@ -8,9 +8,13 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig +COPY vsyscall_emu/vsyscall_trace /usr/local/sbin/vsyscall_trace + COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && rm -r build_scripts ENV SSL_CERT_FILE=/opt/_internal/certs.pem +ENTRYPOINT ["/usr/local/sbin/vsyscall_trace"] + CMD ["/bin/bash"] diff --git a/docker/vsyscall_emu/.gitignore b/docker/vsyscall_emu/.gitignore new file mode 100644 index 00000000..65edf391 --- /dev/null +++ b/docker/vsyscall_emu/.gitignore @@ -0,0 +1,2 @@ +vsyscall_trace +vsyscall_trace_test diff --git a/docker/vsyscall_emu/Makefile b/docker/vsyscall_emu/Makefile new file mode 100644 index 00000000..5ab29d1c --- /dev/null +++ b/docker/vsyscall_emu/Makefile @@ -0,0 +1,20 @@ +PLATFORM ?= $(shell uname -m) + +ifeq ($(PLATFORM),x86_64) + TARGETS = vsyscall_trace vsyscall_trace_test +else + TARGETS = +endif + +all: $(TARGETS) + +vsyscall_trace: vsyscall_trace.c + $(CC) -o $@ $< -ldl + +vsyscall_trace_test: vsyscall_trace.c + $(CC) -o $@ $< -ldl -DVSYSCALL_BASE=0xfffffffffe600000 -DDEBUG + +clean: + $(RM) $(TARGETS) + +.PHONY: clean diff --git a/docker/vsyscall_emu/test.sh b/docker/vsyscall_emu/test.sh new file mode 100755 index 00000000..78785996 --- /dev/null +++ b/docker/vsyscall_emu/test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Test vsyscall_trace, either on a docker image if a name is provided or +# directly on the host otherwise. + +set -e + +if [ "${PLATFORM:-$(uname -m)}" != x86_64 ]; then + exit 0 +fi + +# Get build utilities +cd "$(dirname "${BASH_SOURCE[0]}")" +source ../build_scripts/build_utils.sh + +set -x + +# Run the kernel vsyscall test command with a fake vsyscall page +# address, so that we're guaranteed that the host kernel will segfault +# on the attempted vsyscalls. +curl -sSLO https://github.com/torvalds/linux/raw/v4.15/tools/testing/selftests/x86/test_vsyscall.c +check_sha256sum test_vsyscall.c ff55a0c8ae2fc03a248a7fa1c47ba00bfe73abcef09606b6708e01f246a4f2b5 +echo 'fffffffffe600000-fffffffffe601000 --xp 00000000 00:00 0 [vsyscall]' > maps +sed -i -e 's/0xffffffffff6/0xfffffffffe6/' -e 's|/proc/self/maps|/proc/self/cwd/maps|' test_vsyscall.c +cc -ggdb3 -o test_vsyscall test_vsyscall.c -ldl + +if [ -n "$1" ]; then + docker run -v "$PWD":/vsyscall_emu --rm --entrypoint /vsyscall_emu/vsyscall_trace_test --security-opt=seccomp:unconfined --workdir /vsyscall_emu "$1" ./test_vsyscall +else + ./vsyscall_trace_test ./test_vsyscall +fi + +rm -f test_vsyscall test_vsyscall.c maps diff --git a/docker/vsyscall_emu/vsyscall_trace.c b/docker/vsyscall_emu/vsyscall_trace.c new file mode 100644 index 00000000..72467eaa --- /dev/null +++ b/docker/vsyscall_emu/vsyscall_trace.c @@ -0,0 +1,251 @@ +/* Using ptrace, catch when a process in a process tree is about to + * segfault from an attempted vsyscall, and fix it up to use the vDSO + * instead. + * + * usage: vsyscall_trace -p ... + * vsyscall_trace [args...] + * + * In the first mode, traces a process and all its children, until they + * exit. In the second mode, run and trace a child process -- unless + * vsyscalls are enabled, in which case it will just exec the child + * process directly. Because the second mode waits on child processes (as + * required by the ptrace API), it is usable as init inside a container. + * Whether or not it runs as init, it will block until all descendant + * processes exit. + * + * This program itself uses no vsyscalls, so it can be safely + * dynamically linked against an older glibc. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define debug_printf printf +#else +#define debug_printf(...) 0 +#endif + +/* These are ABI constants: see arch/x86/include/uapi/asm/vsyscall.h + * in the kernel source (probably installed on your system as + * - we don't directly use this header so that this + * program continues to compile when it is removed). They start at + * VSYSCALL_ADDR, and increase by 1024 for each call. */ + +#ifndef VSYSCALL_BASE +#define VSYSCALL_BASE 0xffffffffff600000 +#endif +const unsigned long VSYS_gettimeofday = VSYSCALL_BASE + 0, + VSYS_time = VSYSCALL_BASE + 0x400, + VSYS_getcpu = VSYSCALL_BASE + 0x800; + +/* The vDSO is an area of memory that looks like a normal relocatable + * dynamic library, magically placed in your address space by the + * kernel. While it's mapped at a different address in each process when + * ASLR is enabled, the relative offsets are the same, since the kernel + * only contains one vDSO. These variables contain the relative offsets + * as found in the current process. */ +unsigned long VDSO_gettimeofday, VDSO_time, VDSO_getcpu; + +/* Look up the vDSO base address for a process in its auxiliary vector. + * See proc(5) and getauxval(3). If we can ptrace the process, we should + * have permissions to do this. */ +unsigned long vdso_address(pid_t pid) { + char *filename; + asprintf(&filename, "/proc/%d/auxv", pid); + int fd = open(filename, O_RDONLY); + if (fd == -1) { + return 0; + } + unsigned long buf[128]; + int i; + if (read(fd, buf, sizeof(buf)) == -1) { + close(fd); + return 0; + } + close(fd); + free(filename); + + for (i = 0; i < 128; i += 2) { + if (buf[i] == AT_SYSINFO_EHDR) { + return buf[i+1]; + } else if (buf[i] == 0) { + return 0; + } + } +} + +/* If the ptraced process segfaulted because it tried to call one of the + * three vsyscalls, redirect its instruction pointer to the + * corresponding vDSO address. The calling conventions are the same, so + * we don't need to change / inspect arguments or do any other safety + * checks - the process could have gotten here on its own. */ +int handle_vsyscall(pid_t pid) { + struct user_regs_struct regs; + ptrace(PTRACE_GETREGS, pid, 0, ®s); + debug_printf("got a segfault at %p\n", regs.rip); + if ((regs.rip & 0xfffffffffffff0ff) == VSYSCALL_BASE) { + debug_printf("handling vsyscall for %d\n", pid); + unsigned long vdso = vdso_address(pid); + if (vdso_address == 0) { + debug_printf("couldn't find vdso\n"); + return 0; + } + debug_printf("vdso address is %p\n", vdso); + + if (regs.rip == VSYS_gettimeofday) { + regs.rip = vdso + VDSO_gettimeofday; + } else if (regs.rip == VSYS_time) { + regs.rip = vdso + VDSO_time; + } else if (regs.rip == VSYS_getcpu) { + regs.rip = vdso + VDSO_getcpu; + } else { + debug_printf("invalid vsyscall %x\n", regs.rip); + return 0; + } + debug_printf("fixing up rip to %p\n", regs.rip); + ptrace(PTRACE_SETREGS, pid, 0, ®s); + return 1; + } + return 0; +} + +int main(int argc, char *argv[]) { + pid_t pid, child_pid = 0; + int wstatus, child_wstatus = 0; + + if (argc < 2) { + printf("usage: vsyscall_trace -p ...\n"); + printf(" vsyscall_trace [args...]\n"); + return 1; + } + + /* Seize all the processes via ptrace. We don't need to track + * them, we only need to call wait(), and the options we're + * passing to PTRACE_SEIZE will cause us to silently pick up + * child processes too. */ + if (strcmp(argv[1], "-p") == 0) { + int i; + for (i = 2; i < argc; i++) { + pid = atoi(argv[i]); + if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) { + perror("PTRACE_SEIZE"); + return 1; + } + } + } else { + /* Test to see if vsyscalls work on this machine. If so, + * we don't need to do anything - exec the given command + * so we get entirely out of the way and don't risk + * breaking the process. */ + child_pid = fork(); + if (child_pid == -1) { + perror("fork"); + return 1; + } else if (child_pid == 0) { + ((time_t (*)(time_t *))VSYS_time)(NULL); + return 0; + } else { + waitpid(child_pid, &wstatus, 0); + /* If the child process segfaulted, it will show + * up as WIFSIGNALED instead of WIFEXITED. */ + if (WIFEXITED(wstatus)) { + execvp(argv[1], &argv[1]); + perror("execvp"); + return 1; + } + } + + /* Actually start the child process. */ + child_pid = fork(); + if (child_pid == -1) { + perror("fork"); + return 1; + } else if (child_pid == 0) { + /* Allow the parent process to run PTRACE_SEIZE + * before continuing. */ + raise(SIGSTOP); + execvp(argv[1], &argv[1]); + perror("execvp"); + return 1; + } else { + if (ptrace(PTRACE_SEIZE, child_pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) { + if (errno == EPERM) { + fprintf(stderr, "Error: no kernel vsyscall support and ptrace is disabled.\n"); + fprintf(stderr, "Your kernel does not provide vsyscall emulation, and we cannot\n"); + fprintf(stderr, "work around this because ptrace is prohibited inside this container.\n"); + fprintf(stderr, "Either permit ptrace for this container (e.g., for Docker, use\n"); + fprintf(stderr, "docker run --security-opt=seccomp:unconfined) or boot your kernel\n"); + fprintf(stderr, "with vsyscall=emulate.\n"); + } else { + perror("PTRACE_SEIZE"); + } + kill(child_pid, SIGKILL); + return 1; + } + + fprintf(stderr, "Warning: using ptrace-based vsyscall emulation.\n"); + fprintf(stderr, "This container contains old binaries which require the use of the legacy\n"); + fprintf(stderr, "'vsyscall' feature of the Linux kernel, and your kernel does not provide\n"); + fprintf(stderr, "vsyscall emulation. We will attempt to emulate vsyscalls ourselves using\n"); + fprintf(stderr, "ptrace, but performance may suffer and other tools that use ptrace (e.g.,\n"); + fprintf(stderr, "gdb and strace) will not work.\n"); + fprintf(stderr, "To avoid this emulation, please boot your kernel with vsyscall=emulate.\n"); + kill(child_pid, SIGCONT); + } + } + + /* The vDSO shows up as an object in our address space naemd + * "linux-vdso.so.1" that's already been loaded. */ + void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_NOLOAD); + unsigned long my_vdso_base = vdso_address(getpid()); + VDSO_gettimeofday = (unsigned long)dlsym(vdso, "__vdso_gettimeofday") - my_vdso_base; + VDSO_time = (unsigned long)dlsym(vdso, "__vdso_time") - my_vdso_base; + VDSO_getcpu = (unsigned long)dlsym(vdso, "__vdso_getcpu") - my_vdso_base; + + while ((pid = waitpid(-1, &wstatus, 0)) != -1) { + if (WIFSTOPPED(wstatus)) { + if (WSTOPSIG(wstatus) == SIGSEGV && handle_vsyscall(pid)) { + /* The last argument to PTRACE_CONT is + * the signal to send - passing 0 means + * to suppress the signal. */ + ptrace(PTRACE_CONT, pid, 0, 0); + } else { + ptrace(PTRACE_CONT, pid, 0, WSTOPSIG(wstatus)); + } + } else if (pid == child_pid && WIFEXITED(wstatus)) { + /* Save this exit status so we can use it as our + * own exit status. But don't exit yet if there + * are further descendant processes still + * running. */ + child_wstatus = wstatus; + } + } + if (errno != ECHILD) { + perror("waitpid"); + return 1; + } + if (WIFSIGNALED(wstatus)) { + /* Send ourselves the same signal that killed the child + * process, so our own parent process reports the right + * exit status. */ + raise(WTERMSIG(wstatus)); + /* In case that signal is not fatal, return nonzero. */ + return 1; + } else { + return WEXITSTATUS(wstatus); + } +}