From 1d02e12ad1ccd37f51733bb09326a4aa95db45f9 Mon Sep 17 00:00:00 2001 From: Kostas Kyrimis Date: Fri, 20 Oct 2023 10:50:55 +0300 Subject: [PATCH] chore: call debug stacktrace on SIGUSR1 (#2012) * add macro to install a signal handler that prints the contents of debug stacktrace on SIGUSR1 * add this on regTests --- .github/workflows/regression-tests.yml | 2 +- src/server/CMakeLists.txt | 9 +++++++-- src/server/main_service.cc | 13 +++++++++++++ tests/dragonfly/instance.py | 8 ++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/.github/workflows/regression-tests.yml b/.github/workflows/regression-tests.yml index 2986bfe09ee7..3393d9d0659f 100644 --- a/.github/workflows/regression-tests.yml +++ b/.github/workflows/regression-tests.yml @@ -28,7 +28,7 @@ jobs: - name: Configure & Build run: | cmake -B ${GITHUB_WORKSPACE}/build -DCMAKE_BUILD_TYPE=${{matrix.build-type}} -GNinja \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DPRINT_STACKTRACES_ON_SIGNAL=ON cd ${GITHUB_WORKSPACE}/build && ninja dragonfly pwd diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index 3cfd07105cc6..9f9a1caaa256 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -21,8 +21,6 @@ add_library(dfly_transaction db_slice.cc malloc_stats.cc blocking_controller.cc serializer_commons.cc journal/serializer.cc journal/executor.cc journal/streamer.cc ${TX_LINUX_SRCS} acl/acl_log.cc slowlog.cc ) -cxx_link(dfly_transaction dfly_core strings_lib TRDP::fast_float) - if (NOT APPLE) SET(SEARCH_FILES search/search_family.cc search/doc_index.cc search/doc_accessors.cc) @@ -44,6 +42,13 @@ add_library(dragonfly_lib engine_shard_set.cc channel_store.cc command_registry. 
cluster/cluster_family.cc acl/user.cc acl/user_registry.cc acl/acl_family.cc acl/validator.cc acl/helpers.cc) +cxx_link(dfly_transaction dfly_core strings_lib TRDP::fast_float) + +option(PRINT_STACKTRACES_ON_SIGNAL "Enables DF to print all fiber stacktraces on SIGUSR1" OFF) + +if (PRINT_STACKTRACES_ON_SIGNAL) + target_compile_definitions(dragonfly_lib PRIVATE PRINT_STACKTRACES_ON_SIGNAL) +endif() find_library(ZSTD_LIB NAMES libzstd.a libzstdstatic.a zstd NAMES_PER_DIR REQUIRED) diff --git a/src/server/main_service.cc b/src/server/main_service.cc index ee7baa329e19..0ed0eb094141 100644 --- a/src/server/main_service.cc +++ b/src/server/main_service.cc @@ -19,6 +19,7 @@ extern "C" { #include #include +#include #include #include "base/flags.h" @@ -647,6 +648,18 @@ Service::Service(ProactorPool* pp) exit(1); } +#ifdef PRINT_STACKTRACES_ON_SIGNAL + LOG(INFO) << "PRINT STACKTRACES REGISTERED"; + pp_.GetNextProactor()->RegisterSignal({SIGUSR1}, [this](int signal) { + LOG(INFO) << "Received " << strsignal(signal); + util::fb2::Mutex m; + pp_.AwaitFiberOnAll([&m](unsigned index, util::ProactorBase* base) { + std::unique_lock lk(m); + util::fb2::detail::FiberInterface::PrintAllFiberStackTraces(); + }); + }); +#endif + shard_set = new EngineShardSet(pp); // We support less than 1024 threads and we support less than 1024 shards. 
diff --git a/tests/dragonfly/instance.py b/tests/dragonfly/instance.py index 22612581f07e..b6ce53028d16 100644 --- a/tests/dragonfly/instance.py +++ b/tests/dragonfly/instance.py @@ -160,6 +160,14 @@ def stop(self, kill=False): proc.terminate() proc.communicate(timeout=15) except subprocess.TimeoutExpired: + # We need to send SIGUSR1 to DF such that it prints the stacktrace + proc.send_signal(signal.SIGUSR1) + # Then we sleep for 5 seconds such that DF has enough time to print the stacktraces + # We can't really synchronize here because SIGTERM and SIGKILL do not block even if + # sigaction explicitly blocks other incoming signals until it handles SIGUSR1. + # Even worse, on SIGTERM and SIGKILL none of the handlers registered via sigaction + # are guaranteed to run + time.sleep(5) logging.debug(f"Unable to kill the process on port {self._port}") logging.debug(f"INFO LOGS of DF are:") self.print_info_logs_to_debug_log()