From dfd0ecc746b2dee16b5cace3ba4add5bd89ad54e Mon Sep 17 00:00:00 2001
From: Geoffrey Thomas <geofft@ldpreload.com>
Date: Sat, 10 Feb 2018 04:46:11 -0500
Subject: [PATCH 1/3] Emulate the vsyscall page in userspace in the x86_64
 Docker image

Since some recent distros ship kernels with vsyscall=none by default,
the manylinux1 Docker image doesn't work on them: its old binaries still
call into the vsyscall page and segfault. Fortunately, we can emulate
everything in userspace by catching segmentation faults at the vsyscall
addresses and redirecting the program to the vDSO instead.

Add an entrypoint to the x86_64 Docker image to detect whether this
emulation is required, and if so, catch these segfaults via ptrace and
adjust the instruction pointer. Using the ptrace syscall at all in
recent versions of Docker requires

    docker run --security-opt=seccomp:unconfined

(which an error message will tell you to do if needed).

There is also a mode for the ptrace helper to trace an existing process
and its children. Because `docker build` doesn't support the
`--security-opt` option, this can be useful for building the manylinux1
image, by running this helper on docker-containerd.
---
 .travis.yml                          |   1 +
 docker/Dockerfile-x86_64             |   4 +
 docker/vsyscall_emu/.gitignore       |   1 +
 docker/vsyscall_emu/Makefile         |  13 ++
 docker/vsyscall_emu/vsyscall_trace.c | 242 +++++++++++++++++++++++++++
 5 files changed, 261 insertions(+)
 create mode 100644 docker/vsyscall_emu/.gitignore
 create mode 100644 docker/vsyscall_emu/Makefile
 create mode 100644 docker/vsyscall_emu/vsyscall_trace.c
diff --git a/.travis.yml b/.travis.yml
index 09c865a4..abca6564 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,7 @@ matrix:
     - env: PLATFORM="x86_64"
 
 script:
+  - make -C docker/vsyscall_emu
   - docker build --rm -t quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT -f docker/Dockerfile-$PLATFORM docker/
 
 
diff --git a/docker/Dockerfile-x86_64 b/docker/Dockerfile-x86_64
index abcfcb13..7af4267c 100644
--- a/docker/Dockerfile-x86_64
+++ b/docker/Dockerfile-x86_64
@@ -8,9 +8,13 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 
+COPY vsyscall_emu/vsyscall_trace /usr/local/sbin/vsyscall_trace
+
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && rm -r build_scripts
 
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 
+ENTRYPOINT ["/usr/local/sbin/vsyscall_trace"]
+
 CMD ["/bin/bash"]
diff --git a/docker/vsyscall_emu/.gitignore b/docker/vsyscall_emu/.gitignore
new file mode 100644
index 00000000..b2518a3b
--- /dev/null
+++ b/docker/vsyscall_emu/.gitignore
@@ -0,0 +1 @@
+vsyscall_trace
diff --git a/docker/vsyscall_emu/Makefile b/docker/vsyscall_emu/Makefile
new file mode 100644
index 00000000..38958795
--- /dev/null
+++ b/docker/vsyscall_emu/Makefile
@@ -0,0 +1,13 @@
+ifeq ($(PLATFORM),x86_64)
+  all: vsyscall_trace
+else
+  all:
+endif
+
+vsyscall_trace: vsyscall_trace.c
+	$(CC) -o $@ $< -ldl
+
+clean:
+	$(RM) -f vsyscall_trace
+
+.PHONY: clean
diff --git a/docker/vsyscall_emu/vsyscall_trace.c b/docker/vsyscall_emu/vsyscall_trace.c
new file mode 100644
index 00000000..3f88a261
--- /dev/null
+++ b/docker/vsyscall_emu/vsyscall_trace.c
@@ -0,0 +1,242 @@
+/* Using ptrace, catch when a process in a process tree is about to
+ * segfault from an attempted vsyscall, and fix it up to use the vDSO
+ * instead.
+ *
+ * usage: vsyscall_trace -p <pid>...
+ *        vsyscall_trace <cmd> [args...]
+ *
+ * In the first mode, trace the given processes and all their children
+ * until they exit. In the second mode, run and trace a child process --
+ * unless vsyscalls already work, in which case just exec the command
+ * directly. Because the second mode waits on child processes (as
+ * required by the ptrace API), it is usable as init inside a container.
+ * Whether or not it runs as init, it will block until all descendant
+ * processes exit.
+ *
+ * This program itself uses no vsyscalls, so it can be safely
+ * dynamically linked against an older glibc.
+ */
+
+#define _GNU_SOURCE
+#include <sys/auxv.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/user.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef DEBUG
+#define debug_printf printf
+#else /* Without -DDEBUG, tracing calls compile away to a no-op statement. */
+#define debug_printf(...) ((void)0)
+#endif
+
+/* These are ABI constants: see arch/x86/include/uapi/asm/vsyscall.h
+ * in the kernel source (probably installed on your system as
+ * <asm/vsyscall.h>). They start at VSYSCALL_ADDR, and
+ * increase by 1024 for each call. */
+const unsigned long VSYS_gettimeofday = 0xffffffffff600000,
+                    VSYS_time = 0xffffffffff600400,
+                    VSYS_getcpu = 0xffffffffff600800;
+
+/* The vDSO is an area of memory that looks like a normal relocatable
+ * dynamic library, magically placed in your address space by the
+ * kernel. While it's mapped at a different address in each process when
+ * ASLR is enabled, the relative offsets are the same, since the kernel
+ * only contains one vDSO. These variables contain the relative offsets
+ * as found in the current process. */
+unsigned long VDSO_gettimeofday, VDSO_time, VDSO_getcpu;
+
+/* Look up the vDSO base address of process `pid` by scanning its auxiliary
+ * vector in /proc/<pid>/auxv for AT_SYSINFO_EHDR; see proc(5) and
+ * getauxval(3). Returns the base address, or 0 if it cannot be found.
+ * If we can ptrace the process, we should have permission to read it. */
+unsigned long vdso_address(pid_t pid) {
+	char filename[32];
+	snprintf(filename, sizeof(filename), "/proc/%d/auxv", pid);
+	int fd = open(filename, O_RDONLY);
+	if (fd == -1) {
+		return 0;
+	}
+	unsigned long buf[128];
+	ssize_t len = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (len < 0) {
+		return 0;
+	}
+
+	/* Only look at the (type, value) pairs that were actually read. */
+	size_t i, pairs = (size_t)len / (2 * sizeof(buf[0]));
+	for (i = 0; i < pairs && buf[2*i] != AT_NULL; i++) {
+		if (buf[2*i] == AT_SYSINFO_EHDR) {
+			return buf[2*i+1];
+		}
+	}
+	return 0;
+}
+
+/* If the ptraced process segfaulted because it tried to call one of the
+ * three vsyscalls, redirect its instruction pointer to the corresponding
+ * vDSO address. The calling conventions match and the process could have
+ * jumped there on its own, so no argument fix-up or safety checks are
+ * needed. Returns 1 if rip was redirected, 0 otherwise. */
+int handle_vsyscall(pid_t pid) {
+	struct user_regs_struct regs;
+	ptrace(PTRACE_GETREGS, pid, 0, &regs);
+	if ((regs.rip & 0xfffffffffffff0ff) == 0xffffffffff600000) {
+		debug_printf("handling vsyscall for %d\n", pid);
+		unsigned long vdso = vdso_address(pid);
+		if (vdso_address == 0) {
+			debug_printf("couldn't find vdso\n");
+			return 0;
+		}
+
+		if (regs.rip == VSYS_gettimeofday) {
+			regs.rip = vdso | VDSO_gettimeofday;
+		} else if (regs.rip == VSYS_time) {
+			regs.rip = vdso | VDSO_time;
+		} else if (regs.rip == VSYS_getcpu) {
+			regs.rip = vdso | VDSO_getcpu;
+		} else {
+			debug_printf("invalid vsyscall %x\n", regs.rip);
+			return 0;
+		}
+		ptrace(PTRACE_SETREGS, pid, 0, &regs);
+		return 1;
+	}
+	return 0;
+}
+
+/* Attach to the given pids (-p) or spawn <cmd> under ptrace, then fix up
+ * vsyscall segfaults until every traced process is gone. Mirrors the exit
+ * status of <cmd> (0 in -p mode); returns 1 if setup or waitpid() fails. */
+int main(int argc, char *argv[]) {
+	pid_t pid, child_pid = 0;
+	int wstatus, child_wstatus = 0;
+
+	if (argc < 2) {
+		printf("usage: vsyscall_trace -p <pid>...\n");
+		printf("       vsyscall_trace <cmd> [args...]\n");
+		return 1;
+	}
+
+	/* Seize all the processes via ptrace. We don't need to track them,
+	 * only to call wait(); the options we pass to PTRACE_SEIZE make us
+	 * silently pick up their child processes too. */
+	if (strcmp(argv[1], "-p") == 0) {
+		int i;
+		for (i = 2; i < argc; i++) {
+			pid = atoi(argv[i]);
+			if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) {
+				perror("PTRACE_SEIZE");
+				return 1;
+			}
+		}
+	} else {
+		/* Probe whether vsyscalls work on this machine. If so, just
+		 * exec the given command so we get entirely out of the way
+		 * and don't risk breaking the process. */
+		child_pid = fork();
+		if (child_pid == -1) {
+			perror("fork");
+			return 1;
+		} else if (child_pid == 0) {
+			((time_t (*)(time_t *))VSYS_time)(NULL);
+			return 0;
+		} else {
+			waitpid(child_pid, &wstatus, 0);
+			/* If the child process segfaulted, it will show
+			 * up as WIFSIGNALED instead of WIFEXITED. */
+			if (WIFEXITED(wstatus)) {
+				execvp(argv[1], &argv[1]);
+				perror("execvp");
+				return 1;
+			}
+		}
+
+		/* Actually start the child process. */
+		child_pid = fork();
+		if (child_pid == -1) {
+			perror("fork");
+			return 1;
+		} else if (child_pid == 0) {
+			/* Allow the parent process to run PTRACE_SEIZE
+			 * before continuing. */
+			raise(SIGSTOP);
+			execvp(argv[1], &argv[1]);
+			perror("execvp");
+			return 1;
+		} else {
+			if (ptrace(PTRACE_SEIZE, child_pid, 0, PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | PTRACE_O_TRACECLONE) != 0) {
+				if (errno == EPERM) {
+					fprintf(stderr, "Error: no kernel vsyscall support and ptrace is disabled.\n");
+					fprintf(stderr, "Your kernel does not provide vsyscall emulation, and we cannot\n");
+					fprintf(stderr, "work around this because ptrace is prohibited inside this container.\n");
+					fprintf(stderr, "Either permit ptrace for this container (e.g., for Docker, use\n");
+					fprintf(stderr, "docker run --security-opt=seccomp:unconfined) or boot your kernel\n");
+					fprintf(stderr, "with vsyscall=emulate.\n");
+				} else {
+					perror("PTRACE_SEIZE");
+				}
+				kill(child_pid, SIGKILL);
+				return 1;
+			}
+
+			fprintf(stderr, "Warning: using ptrace-based vsyscall emulation.\n");
+			fprintf(stderr, "This container contains old binaries which require the use of the legacy\n");
+			fprintf(stderr, "'vsyscall' feature of the Linux kernel, and your kernel does not provide\n");
+			fprintf(stderr, "vsyscall emulation. We will attempt to emulate vsyscalls ourselves using\n");
+			fprintf(stderr, "ptrace, but performance may suffer and other tools that use ptrace (e.g.,\n");
+			fprintf(stderr, "gdb and strace) will not work.\n");
+			fprintf(stderr, "To avoid this emulation, please boot your kernel with vsyscall=emulate.\n");
+			kill(child_pid, SIGCONT);
+		}
+	}
+
+	/* The vDSO shows up as an object in our address space naemd
+	 * "linux-vdso.so.1" that's already been loaded. */
+	void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_NOLOAD);
+	VDSO_gettimeofday = (unsigned long)dlsym(vdso, "__vdso_gettimeofday") & 0xfff;
+	VDSO_time = (unsigned long)dlsym(vdso, "__vdso_time") & 0xfff;
+	VDSO_getcpu = (unsigned long)dlsym(vdso, "__vdso_getcpu") & 0xfff;
+
+	while ((pid = waitpid(-1, &wstatus, 0)) != -1) {
+		if (WIFSTOPPED(wstatus)) {
+			if (WSTOPSIG(wstatus) == SIGSEGV && handle_vsyscall(pid)) {
+				/* The last argument to PTRACE_CONT is
+				 * the signal to send - passing 0 means
+				 * to suppress the signal. */
+				ptrace(PTRACE_CONT, pid, 0, 0);
+			} else {
+				ptrace(PTRACE_CONT, pid, 0, WSTOPSIG(wstatus));
+			}
+		} else if (pid == child_pid) {
+			/* Our direct child exited or was killed: save its
+			 * status to report as our own. Don't exit yet, as
+			 * further descendants may still be running. */
+			child_wstatus = wstatus;
+		}
+	}
+	if (errno != ECHILD) {
+		perror("waitpid");
+		return 1;
+	}
+	if (WIFSIGNALED(child_wstatus)) {
+		/* Send ourselves the same signal that killed the child
+		 * process, so our own parent process reports the right
+		 * exit status. */
+		raise(WTERMSIG(child_wstatus));
+		/* In case that signal is not fatal, return nonzero. */
+		return 1;
+	} else {
+		return WEXITSTATUS(child_wstatus);
+	}
+}

From c2da56a13b0dc167c10e2bbfe517c9ed25ae4afe Mon Sep 17 00:00:00 2001
From: Geoffrey Thomas <geofft@ldpreload.com>
Date: Mon, 26 Feb 2018 20:28:35 -0500
Subject: [PATCH 2/3] vsyscall_trace: Add a test mode using a faked vsyscall
 base address

---
 .travis.yml                          |  1 +
 docker/vsyscall_emu/.gitignore       |  1 +
 docker/vsyscall_emu/Makefile         | 13 ++++++++---
 docker/vsyscall_emu/test.sh          | 33 ++++++++++++++++++++++++++++
 docker/vsyscall_emu/vsyscall_trace.c | 20 ++++++++++++-----
 5 files changed, 59 insertions(+), 9 deletions(-)
 create mode 100755 docker/vsyscall_emu/test.sh

diff --git a/.travis.yml b/.travis.yml
index abca6564..c00b54c8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ matrix:
 script:
   - make -C docker/vsyscall_emu
   - docker build --rm -t quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT -f docker/Dockerfile-$PLATFORM docker/
+  - docker/vsyscall_emu/test.sh quay.io/pypa/manylinux1_$PLATFORM:$TRAVIS_COMMIT
 
 
 deploy:
diff --git a/docker/vsyscall_emu/.gitignore b/docker/vsyscall_emu/.gitignore
index b2518a3b..65edf391 100644
--- a/docker/vsyscall_emu/.gitignore
+++ b/docker/vsyscall_emu/.gitignore
@@ -1 +1,2 @@
 vsyscall_trace
+vsyscall_trace_test
diff --git a/docker/vsyscall_emu/Makefile b/docker/vsyscall_emu/Makefile
index 38958795..5ab29d1c 100644
--- a/docker/vsyscall_emu/Makefile
+++ b/docker/vsyscall_emu/Makefile
@@ -1,13 +1,20 @@
+PLATFORM ?= $(shell uname -m)
+
 ifeq ($(PLATFORM),x86_64)
-  all: vsyscall_trace
+  TARGETS = vsyscall_trace vsyscall_trace_test
 else
-  all:
+  TARGETS =
 endif
 
+all: $(TARGETS)
+
 vsyscall_trace: vsyscall_trace.c
 	$(CC) -o $@ $< -ldl
 
+vsyscall_trace_test: vsyscall_trace.c
+	$(CC) -o $@ $< -ldl -DVSYSCALL_BASE=0xfffffffffe600000 -DDEBUG
+
 clean:
-	$(RM) -f vsyscall_trace
+	$(RM) $(TARGETS)
 
 .PHONY: clean
diff --git a/docker/vsyscall_emu/test.sh b/docker/vsyscall_emu/test.sh
new file mode 100755
index 00000000..78785996
--- /dev/null
+++ b/docker/vsyscall_emu/test.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Test vsyscall_trace, either on a docker image if a name is provided or
+# directly on the host otherwise.
+
+set -e
+
+if [ "${PLATFORM:-$(uname -m)}" != x86_64 ]; then
+    exit 0
+fi
+
+# Get build utilities
+cd "$(dirname "${BASH_SOURCE[0]}")"
+source ../build_scripts/build_utils.sh
+
+set -x
+# Clean up on every exit path; under `set -e` a failed step would skip a final rm.
+trap 'rm -f test_vsyscall test_vsyscall.c maps' EXIT
+
+# Run the kernel vsyscall test command with a fake vsyscall page
+# address, so that we're guaranteed that the host kernel will segfault
+# on the attempted vsyscalls.
+curl -sSLO https://github.com/torvalds/linux/raw/v4.15/tools/testing/selftests/x86/test_vsyscall.c
+check_sha256sum test_vsyscall.c ff55a0c8ae2fc03a248a7fa1c47ba00bfe73abcef09606b6708e01f246a4f2b5
+echo 'fffffffffe600000-fffffffffe601000 --xp 00000000 00:00 0                  [vsyscall]' > maps
+sed -i -e 's/0xffffffffff6/0xfffffffffe6/' -e 's|/proc/self/maps|/proc/self/cwd/maps|' test_vsyscall.c
+cc -ggdb3 -o test_vsyscall test_vsyscall.c -ldl
+
+if [ -n "$1" ]; then
+    docker run -v "$PWD":/vsyscall_emu --rm --entrypoint /vsyscall_emu/vsyscall_trace_test --security-opt=seccomp:unconfined --workdir /vsyscall_emu "$1" ./test_vsyscall
+else
+    ./vsyscall_trace_test ./test_vsyscall
+fi
diff --git a/docker/vsyscall_emu/vsyscall_trace.c b/docker/vsyscall_emu/vsyscall_trace.c
index 3f88a261..56b53864 100644
--- a/docker/vsyscall_emu/vsyscall_trace.c
+++ b/docker/vsyscall_emu/vsyscall_trace.c
@@ -41,11 +41,16 @@
 
 /* These are ABI constants: see arch/x86/include/uapi/asm/vsyscall.h
  * in the kernel source (probably installed on your system as
- * <asm/vsyscall.h>). They start at VSYSCALL_ADDR, and
- * increase by 1024 for each call. */
-const unsigned long VSYS_gettimeofday = 0xffffffffff600000,
-                    VSYS_time = 0xffffffffff600400,
-                    VSYS_getcpu = 0xffffffffff600800;
+ * <asm/vsyscall.h> - we don't directly use this header so that this
+ * program continues to compile when it is removed). They start at
+ * VSYSCALL_ADDR, and increase by 1024 for each call. */
+
+#ifndef VSYSCALL_BASE
+#define VSYSCALL_BASE 0xffffffffff600000
+#endif
+const unsigned long VSYS_gettimeofday = VSYSCALL_BASE + 0,
+                    VSYS_time = VSYSCALL_BASE + 0x400,
+                    VSYS_getcpu = VSYSCALL_BASE + 0x800;
 
 /* The vDSO is an area of memory that looks like a normal relocatable
  * dynamic library, magically placed in your address space by the
@@ -91,13 +96,15 @@ unsigned long vdso_address(pid_t pid) {
 int handle_vsyscall(pid_t pid) {
 	struct user_regs_struct regs;
 	ptrace(PTRACE_GETREGS, pid, 0, &regs);
-	if ((regs.rip & 0xfffffffffffff0ff) == 0xffffffffff600000) {
+	debug_printf("got a segfault at %p\n", regs.rip);
+	if ((regs.rip & 0xfffffffffffff0ff) == VSYSCALL_BASE) {
 		debug_printf("handling vsyscall for %d\n", pid);
 		unsigned long vdso = vdso_address(pid);
-		if (vdso_address == 0) {
+		if (vdso == 0) {
 			debug_printf("couldn't find vdso\n");
 			return 0;
 		}
+		debug_printf("vdso address is %p\n", vdso);
 
 		if (regs.rip == VSYS_gettimeofday) {
 			regs.rip = vdso | VDSO_gettimeofday;
@@ -109,6 +116,7 @@ int handle_vsyscall(pid_t pid) {
 			debug_printf("invalid vsyscall %x\n", regs.rip);
 			return 0;
 		}
+		debug_printf("fixing up rip to %p\n", regs.rip);
 		ptrace(PTRACE_SETREGS, pid, 0, &regs);
 		return 1;
 	}

From cfa86745e1ee26185246fe19f6d1fcc8c1163e6e Mon Sep 17 00:00:00 2001
From: Geoffrey Thomas <geofft@ldpreload.com>
Date: Mon, 26 Feb 2018 20:31:58 -0500
Subject: [PATCH 3/3] vsyscall_emu: Handle the vDSO correctly

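The vDSO can be more than one page, so masking the symbol addresses with
0xfff and OR-ing them into the tracee's vDSO base can produce a wrong
address. Compute each symbol's offset from our own vDSO base instead,
and add that offset to the tracee's base.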
---
 docker/vsyscall_emu/vsyscall_trace.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docker/vsyscall_emu/vsyscall_trace.c b/docker/vsyscall_emu/vsyscall_trace.c
index 56b53864..72467eaa 100644
--- a/docker/vsyscall_emu/vsyscall_trace.c
+++ b/docker/vsyscall_emu/vsyscall_trace.c
@@ -107,11 +107,11 @@ int handle_vsyscall(pid_t pid) {
 		debug_printf("vdso address is %p\n", vdso);
 
 		if (regs.rip == VSYS_gettimeofday) {
-			regs.rip = vdso | VDSO_gettimeofday;
+			regs.rip = vdso + VDSO_gettimeofday;
 		} else if (regs.rip == VSYS_time) {
-			regs.rip = vdso | VDSO_time;
+			regs.rip = vdso + VDSO_time;
 		} else if (regs.rip == VSYS_getcpu) {
-			regs.rip = vdso | VDSO_getcpu;
+			regs.rip = vdso + VDSO_getcpu;
 		} else {
 			debug_printf("invalid vsyscall %x\n", regs.rip);
 			return 0;
@@ -211,9 +211,10 @@ int main(int argc, char *argv[]) {
 	/* The vDSO shows up as an object in our address space naemd
 	 * "linux-vdso.so.1" that's already been loaded. */
 	void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_NOLOAD);
-	VDSO_gettimeofday = (unsigned long)dlsym(vdso, "__vdso_gettimeofday") & 0xfff;
-	VDSO_time = (unsigned long)dlsym(vdso, "__vdso_time") & 0xfff;
-	VDSO_getcpu = (unsigned long)dlsym(vdso, "__vdso_getcpu") & 0xfff;
+	unsigned long my_vdso_base = vdso_address(getpid());
+	VDSO_gettimeofday = (unsigned long)dlsym(vdso, "__vdso_gettimeofday") - my_vdso_base;
+	VDSO_time = (unsigned long)dlsym(vdso, "__vdso_time") - my_vdso_base;
+	VDSO_getcpu = (unsigned long)dlsym(vdso, "__vdso_getcpu") - my_vdso_base;
 
 	while ((pid = waitpid(-1, &wstatus, 0)) != -1) {
 		if (WIFSTOPPED(wstatus)) {