From c0aff781e80202c0d84962367623a0d469d707f8 Mon Sep 17 00:00:00 2001
From: Qiang Huang <h.huangqiang@huawei.com>
Date: Fri, 20 Jan 2017 13:10:02 +0800
Subject: [PATCH] Fix race condition when sync with child and grandchild

Fixes: #1236
Fixes: #1281

Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
---
 libcontainer/nsenter/nsexec.c | 108 +++++++++++++++++++++++-----------
 1 file changed, 73 insertions(+), 35 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 7d15aeb5ace..47068a8f713 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -33,7 +33,8 @@ enum sync_t {
 	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
 	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
-	SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */
+	SYNC_GRANDCHILD  = 0x44, /* The grandchild is ready to run. */
+	SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
 
 	/* XXX: This doesn't help with segfaults and other such issues. */
 	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
@@ -413,7 +414,7 @@ void nsexec(void)
 {
 	int pipenum;
 	jmp_buf env;
-	int syncpipe[2];
+	int syncChildPipe[2], syncGrandchildPipe[2];
 	struct nlconfig_t config = {0};
 
 	/*
@@ -433,9 +434,16 @@ void nsexec(void)
 	nl_parse(pipenum, &config);
 
 	/* Pipe so we can tell the child when we've finished setting up. */
-	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
+	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncChildPipe) < 0)
 		bail("failed to setup sync pipe between parent and child");
 
+	/*
+	 * We need a new socketpair to sync with grandchild so we don't have
+	 * race condition with child.
+	 */
+	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncGrandchildPipe) < 0)
+		bail("failed to setup sync pipe between parent and grandchild");
+
 	/* TODO: Currently we aren't dealing with child deaths properly. */
 
 	/*
@@ -494,9 +502,10 @@ void nsexec(void)
 	 *          process.
 	 */
 	case JUMP_PARENT: {
-			int len, ready = 0;
+			int len;
 			pid_t child;
 			char buf[JSON_MAX];
+			bool ready = false;
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
@@ -513,26 +522,23 @@ void nsexec(void)
 			 * ready, so we can receive all possible error codes
 			 * generated by children.
 			 */
-			while (ready < 2) {
+			while (!ready) {
 				enum sync_t s;
+				int ret;
 
-				/* This doesn't need to be global, we're in the parent. */
-				int syncfd = syncpipe[1];
+				syncfd = syncChildPipe[1];
+				close(syncChildPipe[0]);
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with child: next state");
 
 				switch (s) {
-				case SYNC_ERR: {
-						/* We have to mirror the error code of the child. */
-						int ret;
-
-						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
-							bail("failed to sync with child: read(error code)");
+				case SYNC_ERR:
+					/* We have to mirror the error code of the child. */
+					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+						bail("failed to sync with child: read(error code)");
 
-						exit(ret);
-					}
-					break;
+					exit(ret);
 				case SYNC_USERMAP_PLS:
 					/* Enable setgroups(2) if we've been asked to. */
 					if (config.is_setgroup)
@@ -548,11 +554,6 @@ void nsexec(void)
 						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
 					}
 					break;
-				case SYNC_USERMAP_ACK:
-					/* We should _never_ receive acks. */
-					kill(child, SIGKILL);
-					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
-					break;
 				case SYNC_RECVPID_PLS: {
 						pid_t old = child;
 
@@ -570,20 +571,46 @@ void nsexec(void)
 							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
 						}
 					}
-
-					ready++;
-					break;
-				case SYNC_RECVPID_ACK:
-					/* We should _never_ receive acks. */
-					kill(child, SIGKILL);
-					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
 					break;
 				case SYNC_CHILD_READY:
-					ready++;
+					ready = true;
 					break;
 				default:
-					bail("unexpected sync value");
+					bail("unexpected sync value: %u", s);
+				}
+			}
+
+			/* Now sync with grandchild. */
+
+			ready = false;
+			while (!ready) {
+				enum sync_t s;
+				int ret;
+
+				syncfd = syncGrandchildPipe[1];
+				close(syncGrandchildPipe[0]);
+
+				s = SYNC_GRANDCHILD;
+				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+					kill(child, SIGKILL);
+					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
+				}
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with child: next state");
+
+				switch (s) {
+				case SYNC_ERR:
+					/* We have to mirror the error code of the child. */
+					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+						bail("failed to sync with child: read(error code)");
+
+					exit(ret);
+				case SYNC_CHILD_READY:
+					ready = true;
 					break;
+				default:
+					bail("unexpected sync value: %u", s);
 				}
 			}
 
@@ -615,7 +642,8 @@ void nsexec(void)
 			enum sync_t s;
 
 			/* We're in a child and thus need to tell the parent if we die. */
-			syncfd = syncpipe[0];
+			syncfd = syncChildPipe[0];
+			close(syncChildPipe[1]);
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
@@ -700,6 +728,10 @@ void nsexec(void)
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 
+			s = SYNC_CHILD_READY;
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+
 			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
 			exit(0);
 		}
@@ -718,11 +750,19 @@ void nsexec(void)
 			enum sync_t s;
 
 			/* We're in a child and thus need to tell the parent if we die. */
-			syncfd = syncpipe[0];
+			syncfd = syncGrandchildPipe[0];
+			close(syncGrandchildPipe[1]);
+			close(syncChildPipe[0]);
+			close(syncChildPipe[1]);
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
 
+			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
+			if (s != SYNC_GRANDCHILD)
+				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
+
 			if (setsid() < 0)
 				bail("setsid failed");
 
@@ -740,8 +780,7 @@ void nsexec(void)
 				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
 
 			/* Close sync pipes. */
-			close(syncpipe[0]);
-			close(syncpipe[1]);
+			close(syncGrandchildPipe[0]);
 
 			/* Free netlink data. */
 			nl_free(&config);
@@ -751,7 +790,6 @@ void nsexec(void)
 		}
 	default:
 		bail("unexpected jump value");
-		break;
 	}
 
 	/* Should never be reached. */