From 2d6090315e2dc2c626b624ad2f34ce43a8233b9a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 14 Jul 2020 14:54:43 -0700 Subject: [PATCH 1/8] libcontainer/logs: use int for Config.LogPipeFd It does not make sense to have a string for a numeric type. Signed-off-by: Kir Kolyshkin --- init.go | 8 +++++++- libcontainer/logs/logs.go | 13 +++++-------- main.go | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/init.go b/init.go index 08351fdb488..59ce1e8c739 100644 --- a/init.go +++ b/init.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "runtime" + "strconv" "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/logs" @@ -23,8 +24,13 @@ func init() { panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err)) } + logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE") + logPipeFd, err := strconv.Atoi(logPipeFdStr) + if err != nil { + panic(fmt.Sprintf("libcontainer: failed to convert environment variable _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err)) + } err = logs.ConfigureLogging(logs.Config{ - LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"), + LogPipeFd: logPipeFd, LogFormat: "json", LogLevel: logLevel, }) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 1077e7b0145..a49a074eeae 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -6,7 +6,6 @@ import ( "fmt" "io" "os" - "strconv" "sync" "github.com/sirupsen/logrus" @@ -23,7 +22,7 @@ type Config struct { LogLevel logrus.Level LogFormat string LogFilePath string - LogPipeFd string + LogPipeFd int } func ForwardLogs(logPipe io.Reader) { @@ -74,12 +73,10 @@ func ConfigureLogging(config Config) error { logrus.SetLevel(config.LogLevel) - if config.LogPipeFd != "" { - logPipeFdInt, err := strconv.Atoi(config.LogPipeFd) - if err != nil { - return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err) - } - logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe")) + // XXX: while 0 is a valid fd (usually stdin), here we assume + // that we never deliberately set LogPipeFd to 0. + if config.LogPipeFd > 0 { + logrus.SetOutput(os.NewFile(uintptr(config.LogPipeFd), "logpipe")) } else if config.LogFilePath != "" { f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644) if err != nil { diff --git a/main.go b/main.go index 57d7f4282e6..9f8beb16f0d 100644 --- a/main.go +++ b/main.go @@ -176,9 +176,9 @@ func (f *FatalWriter) Write(p []byte) (n int, err error) { func createLogConfig(context *cli.Context) logs.Config { logFilePath := context.GlobalString("log") - logPipeFd := "" + logPipeFd := 0 if logFilePath == "" { - logPipeFd = "2" + logPipeFd = 2 } config := logs.Config{ LogPipeFd: logPipeFd, From f637a5c2c2613027bad5dcc786657993824ae4fa Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 24 Jun 2020 20:06:19 -0700 Subject: [PATCH 2/8] tests/int/debug.bats: fixups 1. Remove redundant "echo $output" from the first test case, as "runc" helper function already logs the output. 2. Show the contents of log.out to stderr, so it case of error we can see what is going on. 3. Remove the check that `log.out` file exists. This check is redundant, since right after it we do `cat log.out` and check its exit code. 4. Factor out common checks into check_debug. Signed-off-by: Kir Kolyshkin --- tests/integration/debug.bats | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats index 16f1b2fb0cc..cb7fe505c65 100644 --- a/tests/integration/debug.bats +++ b/tests/integration/debug.bats @@ -10,16 +10,19 @@ function teardown() { teardown_bundle } +function check_debug() { + [[ "$*" == *"nsexec started"* ]] + [[ "$*" == *"child process in init()"* ]] +} + @test "global --debug" { # run hello-world runc --debug run test_hello - echo "${output}" [ "$status" -eq 0 ] # check expected debug output was sent to stderr [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + check_debug "$output" } @test "global --debug to --log" { @@ -30,14 +33,11 @@ function teardown() { # check output does not include debug info [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - + cat log.out >&2 # check expected debug output was sent to log.out output=$(cat log.out) [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + check_debug "$output" } @test "global --debug to --log --log-format 'text'" { @@ -48,14 +48,11 @@ function teardown() { # check output does not include debug info [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - + cat log.out >&2 # check expected debug output was sent to log.out output=$(cat log.out) [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + check_debug "$output" } @test "global --debug to --log --log-format 'json'" { @@ -66,12 +63,9 @@ function teardown() { # check output does not include debug info [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - + cat log.out >&2 # check expected debug output was sent to log.out output=$(cat log.out) [[ "${output}" == *'"level":"debug"'* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + check_debug "$output" } From 2223c1ba04656b0026cd4407903c5e3ee4433aae Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 14 Jul 2020 15:33:13 -0700 Subject: [PATCH 3/8] libct/logs.ForwardLogs: use bufio.Scanner Error handling is slightly cleaner this way. While at it, do minor refactoring and fix error logging in processEntry. Signed-off-by: Kir Kolyshkin --- libcontainer/logs/logs.go | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index a49a074eeae..3a45f1a5f78 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -26,37 +26,34 @@ type Config struct { } func ForwardLogs(logPipe io.Reader) { - lineReader := bufio.NewReader(logPipe) - for { - line, err := lineReader.ReadBytes('\n') - if len(line) > 0 { - processEntry(line) - } - if err == io.EOF { - logrus.Debugf("log pipe has been closed: %+v", err) - return - } - if err != nil { - logrus.Errorf("log pipe read error: %+v", err) - } + s := bufio.NewScanner(logPipe) + for s.Scan() { + processEntry(s.Bytes()) + } + if err := s.Err(); err != nil { + logrus.Errorf("log pipe read error: %+v", err) + } else { + logrus.Debugf("log pipe closed") } } func processEntry(text []byte) { - type jsonLog struct { + if len(text) == 0 { + return + } + + var jl struct { Level string `json:"level"` Msg string `json:"msg"` } - - var jl jsonLog if err := json.Unmarshal(text, &jl); err != nil { - logrus.Errorf("failed to decode %q to json: %+v", text, err) + logrus.Errorf("failed to decode %q to json: %v", text, err) return } lvl, err := logrus.ParseLevel(jl.Level) if err != nil { - logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err) + logrus.Errorf("failed to parse log level %q: %v", jl.Level, err) return } logrus.StandardLogger().Logf(lvl, jl.Msg) From 6e39153d2f13d6fc8b6d8edbd10e229ed86fd146 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 5 Mar 2021 20:50:59 -0800 Subject: [PATCH 4/8] main: cast Chmod argument to os.FileMode This fixes a big red warning in my vim. Signed-off-by: Kir Kolyshkin --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index 9f8beb16f0d..5efecb9a739 100644 --- a/main.go +++ b/main.go @@ -142,7 +142,7 @@ func main() { fmt.Fprintln(os.Stderr, "the path in $XDG_RUNTIME_DIR must be writable by the user") fatal(err) } - if err := os.Chmod(root, 0700|os.ModeSticky); err != nil { + if err := os.Chmod(root, os.FileMode(0o700)|os.ModeSticky); err != nil { fmt.Fprintln(os.Stderr, "you should check permission of the path in $XDG_RUNTIME_DIR") fatal(err) } From 7577f2f3494317f0acda9622e958931a53d5177f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 5 Mar 2021 20:53:47 -0800 Subject: [PATCH 5/8] runc init: fix double call to ConfigureLogs I have noticed that ConfigureLogs do not return an error in case logging was already configured -- instead it just warns about it. So I went ahead and changed the warning to the actual error... ... only to discover I broke things badly, because in case of runc init logging is configured twice. The fix is to not configure logging in case we are init. Signed-off-by: Kir Kolyshkin --- libcontainer/logs/logs.go | 4 ++-- main.go | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 3a45f1a5f78..f75084161a5 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -8,6 +8,7 @@ import ( "os" "sync" + "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -64,8 +65,7 @@ func ConfigureLogging(config Config) error { defer configureMutex.Unlock() if loggingConfigured { - logrus.Debug("logging has already been configured") - return nil + return errors.New("logging has already been configured") } logrus.SetLevel(config.LogLevel) diff --git a/main.go b/main.go index 5efecb9a739..97bce5bf5a3 100644 --- a/main.go +++ b/main.go @@ -150,6 +150,10 @@ func main() { if err := reviseRootDir(context); err != nil { return err } + // let init configure logging on its own + if args := context.Args(); args != nil && args.First() == "init" { + return nil + } return logs.ConfigureLogging(createLogConfig(context)) } From 3a14578fd00458be128e74a60ac2a3c8cd902a7b Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 5 Mar 2021 21:00:24 -0800 Subject: [PATCH 6/8] runc run/start/exec: fix init log forwarding race Sometimes debug.bats test cases are failing like this: > not ok 27 global --debug to --log --log-format 'json' > # (in test file tests/integration/debug.bats, line 77) > # `[[ "${output}" == *"child process in init()"* ]]' failed It happens more when writing to disk. This issue is caused by the fact that runc spawns log forwarding goroutine (ForwardLogs) but does not wait for it to finish, resulting in missing debug lines from nsexec. ForwardLogs itself, though, never finishes, because it reads from a reading side of a pipe which writing side is not closed. This is especially true in case of runc create, which spawns runc init and exits; meanwhile runc init waits on exec fifo for some undetermined long time before doing execve. So, to fix the failure described above, we need to: 1. Make runc create/run/exec wait for ForwardLogs to finish; 2. Make runc init close its log pipe file descriptor (i.e. the one which value is passed in _LIBCONTAINER_LOGPIPE environment variable). This is exactly what this commit does. In addition, it - adds logrus debug to late stages of runc init, and amends the integration tests to check for those messages; - adds runc --debug exec test cases, similar to those in debug.bats but for runc exec eather than runc run; - refactors libcotainer/logs unit tests to simplify and update for the new ForwardLogs. PS I have to admit I still do not understand why closing log pipe fd is required in e.g. (*linuxSetnsInit).Init, right before the execve which (thanks to CLOEXEC) closes the fd anyway. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 18 ++- libcontainer/container_linux_test.go | 3 +- libcontainer/factory_linux.go | 8 +- libcontainer/init_linux.go | 4 +- libcontainer/logs/logs.go | 26 ++-- libcontainer/logs/logs_linux_test.go | 175 ++++++++++++++------------- libcontainer/process_linux.go | 10 +- libcontainer/restored_process.go | 6 +- libcontainer/setns_init_linux.go | 10 ++ libcontainer/standard_init_linux.go | 14 +++ tests/integration/debug.bats | 1 + tests/integration/exec.bats | 34 +++++- 12 files changed, 201 insertions(+), 108 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 65e2eace8c8..ac4beb63c5a 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -357,12 +357,26 @@ type openResult struct { err error } -func (c *linuxContainer) start(process *Process) error { +func (c *linuxContainer) start(process *Process) (retErr error) { parent, err := c.newParentProcess(process) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } - parent.forwardChildLogs() + + logsDone := parent.forwardChildLogs() + if logsDone != nil { + defer func() { + // Wait for log forwarder to finish. This depends + // on init closing the _LIBCONTAINER_LOGPIPE log fd: + // For standard init, this is done by closeLogFD. + // For setns init, the fd is closed upon execve. + err := <-logsDone + if err != nil && retErr == nil { + retErr = newSystemErrorWithCause(err, "forwarding init logs") + } + }() + } + if err := parent.start(); err != nil { return newSystemErrorWithCause(err, "starting container process") } diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 2a9af6ad28e..78071a92651 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -139,7 +139,8 @@ func (m *mockProcess) externalDescriptors() []string { func (m *mockProcess) setExternalDescriptors(newFds []string) { } -func (m *mockProcess) forwardChildLogs() { +func (m *mockProcess) forwardChildLogs() chan error { + return nil } func TestGetContainerPids(t *testing.T) { diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 0b4cf59eb19..972b0f2b5d9 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -365,6 +365,12 @@ func (l *LinuxFactory) StartInitialization() (err error) { defer consoleSocket.Close() } + logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE") + logPipeFd, err := strconv.Atoi(logPipeFdStr) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err) + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() @@ -387,7 +393,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { } }() - i, err := newContainerInit(it, pipe, consoleSocket, fifofd) + i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd) if err != nil { return err } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index c57af0eebb8..00f28e1f7eb 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -76,7 +76,7 @@ type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -90,6 +90,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd pipe: pipe, consoleSocket: consoleSocket, config: config, + logFd: logFd, }, nil case initStandard: return &linuxStandardInit{ @@ -98,6 +99,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd parentPid: unix.Getppid(), config: config, fifoFd: fifoFd, + logFd: logFd, }, nil } return nil, fmt.Errorf("unknown init type %q", t) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index f75084161a5..dd4c4a6a101 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -13,7 +13,7 @@ import ( ) var ( - configureMutex = sync.Mutex{} + configureMutex sync.Mutex // loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`. // Subsequent invocations of `ConfigureLogging` would be no-op loggingConfigured = false @@ -26,16 +26,22 @@ type Config struct { LogPipeFd int } -func ForwardLogs(logPipe io.Reader) { +func ForwardLogs(logPipe io.ReadCloser) chan error { + done := make(chan error, 1) s := bufio.NewScanner(logPipe) - for s.Scan() { - processEntry(s.Bytes()) - } - if err := s.Err(); err != nil { - logrus.Errorf("log pipe read error: %+v", err) - } else { - logrus.Debugf("log pipe closed") - } + + go func() { + for s.Scan() { + processEntry(s.Bytes()) + } + if err := logPipe.Close(); err != nil { + logrus.Errorf("error closing log source: %v", err) + } + done <- s.Err() + close(done) + }() + + return done } func processEntry(text []byte) { diff --git a/libcontainer/logs/logs_linux_test.go b/libcontainer/logs/logs_linux_test.go index ec9ae4fc8b2..0baf7c4cbc0 100644 --- a/libcontainer/logs/logs_linux_test.go +++ b/libcontainer/logs/logs_linux_test.go @@ -1,10 +1,10 @@ package logs import ( - "errors" + "bytes" + "io" "io/ioutil" "os" - "strings" "testing" "time" @@ -12,141 +12,109 @@ import ( ) func TestLoggingToFile(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`) - - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "kitten") { - t.Fatalf("%s does not contain kitten", logFileContent) - } + logToLogWriter(t, l, `{"level": "info","msg":"kitten"}`) + finish(t, l) + check(t, l, "kitten") } func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() - - logToLogWriter(t, logW, "invalid-json-with-kitten") + l := runLogForwarding(t) - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "failed to decode") { - t.Fatalf("%q does not contain decoding error", logFileContent) - } - - truncateLogFile(t, logFile) + logToLogWriter(t, l, "invalid-json-with-kitten") + checkWait(t, l, "failed to decode") - logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + truncateLogFile(t, l.file) - logFileContent = waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "puppy") { - t.Fatalf("%s does not contain puppy", logFileContent) - } + logToLogWriter(t, l, `{"level": "info","msg":"puppy"}`) + finish(t, l) + check(t, l, "puppy") } func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`) + logToLogWriter(t, l, `{"level": "alert","msg":"puppy"}`) + checkWait(t, l, "failed to parse log level") - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "failed to parse log level") { - t.Fatalf("%q does not contain log level parsing error", logFileContent) - } - - truncateLogFile(t, logFile) - - logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + truncateLogFile(t, l.file) - logFileContent = waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "puppy") { - t.Fatalf("%s does not contain puppy", logFileContent) - } + logToLogWriter(t, l, `{"level": "info","msg":"puppy"}`) + finish(t, l) + check(t, l, "puppy") } func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) { - logW, logFile, doneForwarding := runLogForwarding(t) - defer os.Remove(logFile) + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`) + logToLogWriter(t, l, `{"level": "info","msg":"sync"}`) - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(logFileContent, "sync") { - t.Fatalf("%q does not contain sync message", logFileContent) - } - - logW.Close() + // Do not use finish() here as we check done pipe ourselves. + l.w.Close() select { - case <-doneForwarding: + case <-l.done: case <-time.After(10 * time.Second): t.Fatal("log forwarding did not stop after closing the pipe") } + + check(t, l, "sync") } -func logToLogWriter(t *testing.T, logW *os.File, message string) { - _, err := logW.Write([]byte(message + "\n")) +func logToLogWriter(t *testing.T, l *log, message string) { + t.Helper() + _, err := l.w.Write([]byte(message + "\n")) if err != nil { t.Fatalf("failed to write %q to log writer: %v", message, err) } } -func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) { +type log struct { + w io.WriteCloser + file string + done chan error +} + +func runLogForwarding(t *testing.T) *log { + t.Helper() logR, logW, err := os.Pipe() if err != nil { t.Fatal(err) } + t.Cleanup(func() { + logR.Close() + logW.Close() + }) tempFile, err := ioutil.TempFile("", "") if err != nil { t.Fatal(err) } + defer tempFile.Close() logFile := tempFile.Name() + t.Cleanup(func() { os.Remove(logFile) }) logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile} - return logW, logFile, startLogForwarding(t, logConfig, logR) -} - -func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} { loggingConfigured = false if err := ConfigureLogging(logConfig); err != nil { t.Fatal(err) } - doneForwarding := make(chan struct{}) - go func() { - ForwardLogs(logR) - close(doneForwarding) - }() - return doneForwarding -} -func waitForLogContent(t *testing.T, logFile string) string { - startTime := time.Now() + done := ForwardLogs(logR) - for { - if time.Now().After(startTime.Add(10 * time.Second)) { - t.Fatal(errors.New("No content in log file after 10 seconds")) - break - } + return &log{w: logW, done: done, file: logFile} +} - fileContent, err := ioutil.ReadFile(logFile) - if err != nil { - t.Fatal(err) - } - if len(fileContent) == 0 { - continue - } - return string(fileContent) +func finish(t *testing.T, l *log) { + t.Helper() + l.w.Close() + if err := <-l.done; err != nil { + t.Fatalf("ForwardLogs: %v", err) } - - return "" } func truncateLogFile(t *testing.T, logFile string) { - file, err := os.OpenFile(logFile, os.O_RDWR, 0666) + file, err := os.OpenFile(logFile, os.O_RDWR, 0o600) if err != nil { t.Fatalf("failed to open log file: %v", err) return @@ -158,3 +126,40 @@ func truncateLogFile(t *testing.T, logFile string) { t.Fatalf("failed to truncate log file: %v", err) } } + +// check checks that file contains txt +func check(t *testing.T, l *log, txt string) { + t.Helper() + contents, err := ioutil.ReadFile(l.file) + if err != nil { + t.Fatal(err) + } + if !bytes.Contains(contents, []byte(txt)) { + t.Fatalf("%q does not contain %q", string(contents), txt) + } +} + +// checkWait checks that file contains txt. If the file is empty, +// it waits until it's not. +func checkWait(t *testing.T, l *log, txt string) { + t.Helper() + const ( + delay = 100 * time.Millisecond + iter = 3 + ) + for i := 0; ; i++ { + st, err := os.Stat(l.file) + if err != nil { + t.Fatal(err) + } + if st.Size() > 0 { + break + } + if i == iter { + t.Fatalf("waited %s for file %s to be non-empty but it still is", iter*delay, l.file) + } + time.Sleep(delay) + } + + check(t, l, txt) +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 6f2828770ee..8315d709aaa 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -51,7 +51,7 @@ type parentProcess interface { setExternalDescriptors(fds []string) - forwardChildLogs() + forwardChildLogs() chan error } type filePair struct { @@ -251,8 +251,8 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *setnsProcess) forwardChildLogs() { - go logs.ForwardLogs(p.logFilePair.parent) +func (p *setnsProcess) forwardChildLogs() chan error { + return logs.ForwardLogs(p.logFilePair.parent) } type initProcess struct { @@ -605,8 +605,8 @@ func (p *initProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *initProcess) forwardChildLogs() { - go logs.ForwardLogs(p.logFilePair.parent) +func (p *initProcess) forwardChildLogs() chan error { + return logs.ForwardLogs(p.logFilePair.parent) } func getPipeFds(pid int) ([]string, error) { diff --git a/libcontainer/restored_process.go b/libcontainer/restored_process.go index f861e82d1b2..97565d7fa4b 100644 --- a/libcontainer/restored_process.go +++ b/libcontainer/restored_process.go @@ -77,7 +77,8 @@ func (p *restoredProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *restoredProcess) forwardChildLogs() { +func (p *restoredProcess) forwardChildLogs() chan error { + return nil } // nonChildProcess represents a process where the calling process is not @@ -125,5 +126,6 @@ func (p *nonChildProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *nonChildProcess) forwardChildLogs() { +func (p *nonChildProcess) forwardChildLogs() chan error { + return nil } diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 6b1e9a6e97c..9e7fc37dc5a 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -21,6 +22,7 @@ type linuxSetnsInit struct { pipe *os.File consoleSocket *os.File config *initConfig + logFd int } func (l *linuxSetnsInit) getSessionRingName() string { @@ -86,5 +88,13 @@ func (l *linuxSetnsInit) Init() error { return newSystemErrorWithCause(err, "init seccomp") } } + logrus.Debugf("init: about to exec") + // Close the log pipe fd so the parent's ForwardLogs can exit. + // Usually this would be done upon system.Execv below, as the fd + // has CLOEXEC flag set, but it may fail. + if err := unix.Close(l.logFd); err != nil { + return newSystemErrorWithCause(err, "closing log pipe fd") + } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 7ec506c462e..0dee51b624c 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -16,6 +16,7 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -24,6 +25,7 @@ type linuxStandardInit struct { consoleSocket *os.File parentPid int fifoFd int + logFd int config *initConfig } @@ -180,7 +182,19 @@ func (l *linuxStandardInit) Init() error { return err } // Close the pipe to signal that we have completed our init. + logrus.Debugf("init: closing the pipe to signal completion") l.pipe.Close() + + // Close the log pipe fd so the parent's ForwardLogs can exit. + // Usually this would be done upon execve as the file descriptor + // has CLOEXEC flag set, but (1) execve might not happen due to + // any kind of error below; (2) in case of runc start, it spawns + // runc init and exits, while runc init (this code) waits on fifo + // for an indefinitely long time. + if err := unix.Close(l.logFd); err != nil { + return newSystemErrorWithCause(err, "closing log pipe fd") + } + // Wait for the FIFO to be opened on the other side before exec-ing the // user process. We open it through /proc/self/fd/$fd, because the fd that // was given to us was an O_PATH fd to the fifo itself. Linux allows us to diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats index cb7fe505c65..b2f87cf5f59 100644 --- a/tests/integration/debug.bats +++ b/tests/integration/debug.bats @@ -13,6 +13,7 @@ function teardown() { function check_debug() { [[ "$*" == *"nsexec started"* ]] [[ "$*" == *"child process in init()"* ]] + [[ "$*" == *"init: closing the pipe to signal completion"* ]] } @test "global --debug" { diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index b7c9becbf47..779e7480a8e 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -132,6 +132,38 @@ function teardown() { echo hello >preserve-fds.test # fd 3 is used by bats, so we use 4 exec 4&2 + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + check_exec_debug "$output" +} From 4186d209f3e455c9966665229df6bb0f0a09159a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 8 Jun 2020 01:24:30 +1000 Subject: [PATCH 7/8] nsenter: improve debug logging In order to make 'runc --debug' actually useful for debugging nsexec bugs, provide information about all the internal operations when in debug mode. [@kolyshkin: auto-fix formatting using indent for make validate to pass] Signed-off-by: Aleksa Sarai Signed-off-by: Kir Kolyshkin --- libcontainer/init_linux.go | 4 +- libcontainer/nsenter/nsexec.c | 320 +++++++++++++++++++++------------- tests/integration/debug.bats | 2 +- tests/integration/exec.bats | 2 +- 4 files changed, 200 insertions(+), 128 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 00f28e1f7eb..dcebeda11a9 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -35,8 +35,8 @@ const ( ) type pid struct { - Pid int `json:"pid"` - PidFirstChild int `json:"pid_first"` + Pid int `json:"stage2_pid"` + PidFirstChild int `json:"stage1_pid"` } // network is an internal struct used to setup container networks. diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index a33f2fcc371..985b9a44057 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -36,7 +36,7 @@ enum sync_t { SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ }; /* @@ -45,10 +45,14 @@ enum sync_t { */ #define CREATECGROUPNS 0x80 +#define STAGE_SETUP -1 /* longjmp() arguments. */ -#define JUMP_PARENT 0x00 -#define JUMP_CHILD 0xA0 -#define JUMP_INIT 0xA1 +#define STAGE_PARENT 0 +#define STAGE_CHILD 1 +#define STAGE_INIT 2 + +/* Stores the current stage of nsexec. */ +int current_stage = STAGE_SETUP; /* Assume the stack grows down, so arguments should be above it. */ struct clone_t { @@ -56,7 +60,7 @@ struct clone_t { * Reserve some space for clone() to locate arguments * and retcode in this place */ - char stack[4096] __attribute__ ((aligned(16))); + char stack[4096] __attribute__((aligned(16))); char stack_ptr[0]; /* There's two children. This is used to execute the different code. */ @@ -134,33 +138,41 @@ int setns(int fd, int nstype) } #endif -static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +static void write_log(const char *level, const char *format, ...) { - char message[1024] = {}; - + char *message = NULL, *stage = NULL; va_list args; + int ret; if (logfd < 0 || level == NULL) - return; + goto out; va_start(args, format); - if (vsnprintf(message, sizeof(message), format, args) < 0) - goto done; - - dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); -done: + ret = vasprintf(&message, format, args); va_end(args); -} + if (ret < 0) + goto out; + + if (current_stage == STAGE_SETUP) + stage = strdup("nsexec"); + else + ret = asprintf(&stage, "nsexec-%d", current_stage); + if (ret < 0) + goto out; -#define write_log(level, fmt, ...) \ - write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message); + + out: + free(message); + free(stage); +} /* XXX: This is ugly. */ static int syncfd = -1; #define bail(fmt, ...) \ do { \ - write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ + write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \ exit(1); \ } while(0) @@ -297,9 +309,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/uid_map", pid); + write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newuid map on %d", pid); } @@ -310,9 +324,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/gid_map", pid); + write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newgid map on %d", pid); } @@ -323,19 +339,20 @@ static void update_oom_score_adj(char *data, size_t len) if (data == NULL || len <= 0) return; + write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); if (write_file(data, len, "/proc/self/oom_score_adj") < 0) bail("failed to update /proc/self/oom_score_adj"); } /* A dummy function that just jumps to the given jumpval. */ -static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) __attribute__((noinline)); static int child_func(void *arg) { struct clone_t *ca = (struct clone_t *)arg; longjmp(*ca->env, ca->jmpval); } -static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { @@ -507,7 +524,6 @@ void join_namespaces(char *nslist) char *namespace = strtok_r(nslist, ",", &saveptr); struct namespace_t { int fd; - int ns; char type[PATH_MAX]; char path[PATH_MAX]; } *namespaces = NULL; @@ -542,7 +558,7 @@ void join_namespaces(char *nslist) bail("failed to open %s", path); ns->fd = fd; - ns->ns = nsflag(namespace); + strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); @@ -555,12 +571,14 @@ void join_namespaces(char *nslist) */ for (i = 0; i < num; i++) { - struct namespace_t ns = namespaces[i]; + struct namespace_t *ns = &namespaces[i]; + int flag = nsflag(ns->type); - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); + if (setns(ns->fd, flag) < 0) + bail("failed to setns into %s namespace", ns->type); - close(ns.fd); + close(ns->fd); } free(namespaces); @@ -569,6 +587,14 @@ void join_namespaces(char *nslist) /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); +static inline int sane_kill(pid_t pid, int signum) +{ + if (pid > 0) + return kill(pid, signum); + else + return 0; +} + void nsexec(void) { int pipenum; @@ -598,7 +624,7 @@ void nsexec(void) if (ensure_cloned_binary() < 0) bail("could not ensure we are a cloned binary"); - write_log(DEBUG, "nsexec started"); + write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -622,6 +648,7 @@ void nsexec(void) * containers), which is the recommendation from the kernel folks. */ if (config.namespaces) { + write_log(DEBUG, "set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as non-dumpable"); } @@ -686,45 +713,49 @@ void nsexec(void) * -- Aleksa "what has my life come to?" Sarai */ - switch (setjmp(env)) { + current_stage = setjmp(env); + switch (current_stage) { /* * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and + * (stage 1: STAGE_CHILD) process and write its uid_map and * gid_map. That process will go on to create a new process, then * it will send us its PID which we will send to the bootstrap * process. */ - case JUMP_PARENT:{ + case STAGE_PARENT:{ int len; - pid_t child, first_child = -1; - bool ready = false; + pid_t stage1_pid = -1, stage2_pid = -1; + bool stage1_complete, stage2_complete; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-0"); /* Start the process of getting a container. */ - child = clone_parent(&env, JUMP_CHILD); - if (child < 0) - bail("unable to fork: child_func"); + write_log(DEBUG, "spawn stage-1"); + stage1_pid = clone_parent(&env, STAGE_CHILD); + if (stage1_pid < 0) + bail("unable to spawn stage-1"); - /* - * State machine for synchronisation with the children. - * - * Father only return when both child and grandchild are - * ready, so we can receive all possible error codes - * generated by children. - */ syncfd = sync_child_pipe[1]; close(sync_child_pipe[0]); - while (!ready) { + /* + * State machine for synchronisation with the children. We only + * return once both the child and grandchild are ready. + */ + write_log(DEBUG, "-> stage-1 synchronisation loop"); + stage1_complete = false; + while (!stage1_complete) { enum sync_t s; if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); + bail("failed to sync with stage-1: next state"); switch (s) { case SYNC_USERMAP_PLS: + write_log(DEBUG, "stage-1 requested userns mappings"); + /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're @@ -735,70 +766,78 @@ void nsexec(void) * For rootless multi-entry mapping, config.is_setgroup shall be true and * newuidmap/newgidmap shall be used. */ - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(child, SETGROUPS_DENY); + update_setgroups(stage1_pid, SETGROUPS_DENY); /* Set up mappings. */ - update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_RECVPID_PLS:{ - first_child = child; - - /* Get the init_func pid. */ - if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(first_child, SIGKILL); - bail("failed to sync with child: read(childpid)"); - } - - /* Send ACK. */ - s = SYNC_RECVPID_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(first_child, SIGKILL); - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); - } - - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. - */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } + case SYNC_RECVPID_PLS: + write_log(DEBUG, "stage-1 requested pid to be forwarded"); + + /* Get the stage-2 pid. */ + if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: read(stage2_pid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); + } + + /* + * Send both the stage-1 and stage-2 pids back to runc. + * runc needs the stage-2 to continue process management, + * but because stage-1 was spawned with CLONE_PARENT we + * cannot reap it within stage-0 and thus we need to ask + * runc to reap the zombie for us. + */ + write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", + stage1_pid, stage2_pid); + len = + dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, + stage2_pid); + if (len < 0) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with runc: write(pid-JSON)"); } break; - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-1 complete"); + stage1_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-1 synchronisation loop"); /* Now sync with grandchild. */ - syncfd = sync_grandchild_pipe[1]; close(sync_grandchild_pipe[0]); - - ready = false; - while (!ready) { + write_log(DEBUG, "-> stage-2 synchronisation loop"); + stage2_complete = false; + while (!stage2_complete) { enum sync_t s; + write_log(DEBUG, "signalling stage-2 to run"); s = SYNC_GRANDCHILD; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with child: write(SYNC_GRANDCHILD)"); } @@ -806,27 +845,31 @@ void nsexec(void) bail("failed to sync with child: next state"); switch (s) { - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-2 complete"); + stage2_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-2 synchronisation loop"); + write_log(DEBUG, "<~ nsexec stage-0"); exit(0); } + break; /* * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). + * provided namespaces in the netlink payload and unshare all of + * the requested namespaces. If we've been asked to CLONE_NEWUSER, + * we will ask our parent (stage 0) to set up our user mappings + * for us. Then, we create a new child (stage 2: STAGE_INIT) for + * PID namespace. We then send the child's PID to our parent + * (stage 0). */ - case JUMP_CHILD:{ - pid_t child; + case STAGE_CHILD:{ + pid_t stage2_pid = -1; enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ @@ -835,11 +878,12 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-1"); /* * We need to setns first. We cannot do this earlier (in stage 0) * because of the fact that we forked to get here (the PID of - * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ if (config.namespaces) @@ -865,40 +909,50 @@ void nsexec(void) * problem. */ if (config.cloneflags & CLONE_NEWUSER) { + write_log(DEBUG, "unshare user namespace"); if (unshare(CLONE_NEWUSER) < 0) bail("failed to unshare user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal our parent to hook us up. + * We need to set ourselves as dumpable temporarily so that the + * parent process can write to our procfs files. */ - - /* Switching is only necessary if we joined namespaces. */ if (config.namespaces) { + write_log(DEBUG, "temporarily set process as dumpable"); if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to temporarily set process as dumpable"); } + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal stage-0 to do the mapping for + * us. + */ + write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); /* ... wait for mapping ... */ - + write_log(DEBUG, "request stage-0 to map user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); - /* Switching is only necessary if we joined namespaces. */ + + /* Revert temporary re-dumpable setting. */ if (config.namespaces) { + write_log(DEBUG, "re-set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to re-set process as non-dumpable"); } /* Become root in the namespace proper. */ if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } + /* * Unshare all of the namespaces. Now, it should be noted that this * ordering might break in the future (especially with rootless @@ -909,8 +963,9 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ + write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); + bail("failed to unshare remaining namespaces (except cgroupns)"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -921,41 +976,45 @@ void nsexec(void) * which would break many applications and libraries, so we must fork * to actually enter the new PID namespace. */ - child = clone_parent(&env, JUMP_INIT); - if (child < 0) - bail("unable to fork: init_func"); + write_log(DEBUG, "spawn stage-2"); + stage2_pid = clone_parent(&env, STAGE_INIT); + if (stage2_pid < 0) + bail("unable to spawn stage-2"); /* Send the child to our parent, which knows what it's doing. */ + write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); s = SYNC_RECVPID_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); } - if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(childpid)"); + if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(stage2_pid)"); } /* ... wait for parent to get the pid ... */ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); } if (s != SYNC_RECVPID_ACK) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); } - /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ + write_log(DEBUG, "<~ nsexec stage-1"); exit(0); } + break; /* * Stage 2: We're the final child process, and the only process that will @@ -963,7 +1022,7 @@ void nsexec(void) * final cleanup steps and then return to the Go runtime to allow * init_linux.go to run. */ - case JUMP_INIT:{ + case STAGE_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -978,6 +1037,7 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-2"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); @@ -998,21 +1058,30 @@ void nsexec(void) bail("setgroups failed"); } - /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + /* + * Wait until our topmost parent has finished cgroup setup in + * p.manager.Apply(). + * + * TODO(cyphar): Check if this code is actually needed because we + * should be in the cgroup even from stage-0, so + * waiting until now might not make sense. + */ if (config.cloneflags & CLONE_NEWCGROUP) { uint8_t value; if (read(pipenum, &value, sizeof(value)) != sizeof(value)) bail("read synchronisation value failed"); if (value == CREATECGROUPNS) { + write_log(DEBUG, "unshare cgroup namespace"); if (unshare(CLONE_NEWCGROUP) < 0) bail("failed to unshare cgroup namespace"); } else bail("received unknown synchronisation value"); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + bail("failed to sync with patent: write(SYNC_CHILD_FINISH)"); /* Close sync pipes. */ close(sync_grandchild_pipe[0]); @@ -1021,10 +1090,13 @@ void nsexec(void) nl_free(&config); /* Finish executing, let the Go runtime take over. */ + write_log(DEBUG, "<= nsexec container setup"); + write_log(DEBUG, "booting up go runtime ..."); return; } + break; default: - bail("unexpected jump value"); + bail("unknown stage '%d' for jump value", current_stage); } /* Should never be reached. */ diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats index b2f87cf5f59..333745e3969 100644 --- a/tests/integration/debug.bats +++ b/tests/integration/debug.bats @@ -11,7 +11,7 @@ function teardown() { } function check_debug() { - [[ "$*" == *"nsexec started"* ]] + [[ "$*" == *"nsexec container setup"* ]] [[ "$*" == *"child process in init()"* ]] [[ "$*" == *"init: closing the pipe to signal completion"* ]] } diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index 779e7480a8e..3b825e4b768 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -138,7 +138,7 @@ function teardown() { } function check_exec_debug() { - [[ "$*" == *"nsexec started"* ]] + [[ "$*" == *"nsexec container setup"* ]] [[ "$*" == *"child process in init()"* ]] [[ "$*" == *"init: about to exec"* ]] } From fce08a920860d9e4c1577818b58ac1795b1887bc Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 24 Jun 2020 18:00:41 -0700 Subject: [PATCH 8/8] libct/nsenter: add json msg escaping Some strings logged by write_log() contain \n, which leads to errors like this one: > # time="2020-06-07T15:41:37Z" level=error msg="failed to decode \"{\\\"level\\\":\\\"debug\\\", \\\"msg\\\": \\\"nsexec-0[2265]: update /proc/2266/uid_map to '0 1000 1\\n\" to json: invalid character '\\n' in string literal" The fix is to escape such characters. Signed-off-by: Kir Kolyshkin --- libcontainer/nsenter/escape.c | 93 +++++++++++++++++++++++++++++ libcontainer/nsenter/escape.go | 24 ++++++++ libcontainer/nsenter/escape.h | 1 + libcontainer/nsenter/escape_test.go | 23 +++++++ libcontainer/nsenter/nsexec.c | 3 + 5 files changed, 144 insertions(+) create mode 100644 libcontainer/nsenter/escape.c create mode 100644 libcontainer/nsenter/escape.go create mode 100644 libcontainer/nsenter/escape.h create mode 100644 libcontainer/nsenter/escape_test.go diff --git a/libcontainer/nsenter/escape.c b/libcontainer/nsenter/escape.c new file mode 100644 index 00000000000..56f99f12ba0 --- /dev/null +++ b/libcontainer/nsenter/escape.c @@ -0,0 +1,93 @@ +#include + +static char hex(char i) +{ + if (i >= 0 && i < 10) { + return '0' + i; + } + if (i >= 10 && i < 16) { + return 'a' + i - 10; + } + return '?'; +} + +/* + * Escape the string to be usable as JSON string. + */ +char *escape_json_string(char *str) +{ + int i, j = 0; + char *out; + + // Avoid malloc by checking first if escaping is required. + // While at it, count how much additional space we need. + // XXX: the counting code need to be in sync with the rest! + for (i = 0; str[i] != '\0'; i++) { + switch (str[i]) { + case '\\': + case '"': + case '\b': + case '\n': + case '\r': + case '\t': + case '\f': + j += 2; + break; + default: + if (str[i] < ' ') { + // \u00xx + j += 6; + } + } + } + if (j == 0) { + // nothing to escape + return str; + } + + out = malloc(i + j); + if (!out) { + exit(1); + } + for (i = j = 0; str[i] != '\0'; i++, j++) { + switch (str[i]) { + case '"': + case '\\': + out[j++] = '\\'; + out[j] = str[i]; + continue; + } + if (str[i] >= ' ') { + out[j] = str[i]; + continue; + } + out[j++] = '\\'; + switch (str[i]) { + case '\b': + out[j] = 'b'; + break; + case '\n': + out[j] = 'n'; + break; + case '\r': + out[j] = 'r'; + break; + case '\t': + out[j] = 't'; + break; + case '\f': + out[j] = 'f'; + break; + default: + out[j++] = 'u'; + out[j++] = '0'; + out[j++] = '0'; + out[j++] = hex(str[i] >> 4); + out[j] = hex(str[i] & 0x0f); + } + } + out[j] = '\0'; + + free(str); + return out; +} diff --git a/libcontainer/nsenter/escape.go b/libcontainer/nsenter/escape.go new file mode 100644 index 00000000000..c8a9860c7cd --- /dev/null +++ b/libcontainer/nsenter/escape.go @@ -0,0 +1,24 @@ +package nsenter + +// This file is part of escape_json_string unit test. It would be a part +// of escape_test.go if Go would allow cgo to be used in _test.go files. + +// #include +// #include "escape.h" +import "C" + +import ( + "testing" + "unsafe" +) + +func testEscapeJsonString(t *testing.T, input, want string) { + in := C.CString(input) + out := C.escape_json_string(in) + got := C.GoString(out) + C.free(unsafe.Pointer(out)) + t.Logf("input: %q, output: %q", input, got) + if got != want { + t.Errorf("Failed on input: %q, want %q, got %q", input, want, got) + } +} diff --git a/libcontainer/nsenter/escape.h b/libcontainer/nsenter/escape.h new file mode 100644 index 00000000000..80d660a4338 --- /dev/null +++ b/libcontainer/nsenter/escape.h @@ -0,0 +1 @@ +extern char *escape_json_string(char *str); diff --git a/libcontainer/nsenter/escape_test.go b/libcontainer/nsenter/escape_test.go new file mode 100644 index 00000000000..bd28a29146b --- /dev/null +++ b/libcontainer/nsenter/escape_test.go @@ -0,0 +1,23 @@ +package nsenter + +import "testing" + +func TestEscapeJsonString(t *testing.T) { + testCases := []struct { + input, output string + }{ + {"", ""}, + {"abcdef", "abcdef"}, + {`\\\\\\`, `\\\\\\\\\\\\`}, + {`with"quote`, `with\"quote`}, + {"\n\r\b\t\f\\", `\n\r\b\t\f\\`}, + {"\007", "\\u0007"}, + {"\017 \020 \037", "\\u000f \\u0010 \\u001f"}, + {"\033", "\\u001b"}, + {"<->", "<->"}, + } + + for _, tc := range testCases { + testEscapeJsonString(t, tc.input, tc.output) + } +} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 985b9a44057..a67b4f6bbbd 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -28,6 +28,7 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +#include "escape.h" /* Synchronisation values. */ enum sync_t { @@ -153,6 +154,8 @@ static void write_log(const char *level, const char *format, ...) if (ret < 0) goto out; + message = escape_json_string(message); + if (current_stage == STAGE_SETUP) stage = strdup("nsexec"); else