Skip to content

Commit

Permalink
chore: call debug stacktrace on SIGUSR1 (#2012)
Browse files Browse the repository at this point in the history
* add macro to install a signal handler that prints the contents of debug stacktrace on SIGUSR1
* add this on regTests
  • Loading branch information
kostasrim authored Oct 20, 2023
1 parent 64841ef commit 1d02e12
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/regression-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
- name: Configure & Build
run: |
cmake -B ${GITHUB_WORKSPACE}/build -DCMAKE_BUILD_TYPE=${{matrix.build-type}} -GNinja \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DPRINT_STACKTRACES_ON_SIGNAL=ON
cd ${GITHUB_WORKSPACE}/build && ninja dragonfly
pwd
Expand Down
9 changes: 7 additions & 2 deletions src/server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ add_library(dfly_transaction db_slice.cc malloc_stats.cc blocking_controller.cc
serializer_commons.cc journal/serializer.cc journal/executor.cc journal/streamer.cc
${TX_LINUX_SRCS} acl/acl_log.cc slowlog.cc
)
cxx_link(dfly_transaction dfly_core strings_lib TRDP::fast_float)


if (NOT APPLE)
SET(SEARCH_FILES search/search_family.cc search/doc_index.cc search/doc_accessors.cc)
Expand All @@ -44,6 +42,13 @@ add_library(dragonfly_lib engine_shard_set.cc channel_store.cc command_registry.
cluster/cluster_family.cc acl/user.cc acl/user_registry.cc acl/acl_family.cc
acl/validator.cc acl/helpers.cc)

cxx_link(dfly_transaction dfly_core strings_lib TRDP::fast_float)

option(PRINT_STACKTRACES_ON_SIGNAL "Enables DF to print all fiber stacktraces on SIGUSR1" OFF)

if (PRINT_STACKTRACES_ON_SIGNAL)
target_compile_definitions(dragonfly_lib PRIVATE PRINT_STACKTRACES_ON_SIGNAL)
endif()

find_library(ZSTD_LIB NAMES libzstd.a libzstdstatic.a zstd NAMES_PER_DIR REQUIRED)

Expand Down
13 changes: 13 additions & 0 deletions src/server/main_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ extern "C" {
#include <absl/strings/str_format.h>
#include <xxhash.h>

#include <csignal>
#include <filesystem>

#include "base/flags.h"
Expand Down Expand Up @@ -647,6 +648,18 @@ Service::Service(ProactorPool* pp)
exit(1);
}

// Debug aid, compiled in only when PRINT_STACKTRACES_ON_SIGNAL is defined
// (see the CMake option of the same name): on SIGUSR1, dump fiber stack traces.
#ifdef PRINT_STACKTRACES_ON_SIGNAL
LOG(INFO) << "PRINT STACKTRACES REGISTERED";
// Register the SIGUSR1 callback through one proactor taken from the pool.
pp_.GetNextProactor()->RegisterSignal({SIGUSR1}, [this](int signal) {
LOG(INFO) << "Received " << strsignal(signal);
// One mutex shared by all callbacks below, so only one of them prints at a time.
// NOTE(review): presumably this keeps traces from different proactors from
// interleaving in the log - confirm.
util::fb2::Mutex m;
// NOTE(review): AwaitFiberOnAll is assumed to run the callback on every proactor
// in the pool and to return only after all of them finished, which is what makes
// capturing the local `m` by reference safe - confirm against the ProactorPool API.
pp_.AwaitFiberOnAll([&m](unsigned index, util::ProactorBase* base) {
std::unique_lock lk(m);
util::fb2::detail::FiberInterface::PrintAllFiberStackTraces();
});
});
#endif

shard_set = new EngineShardSet(pp);

// We support less than 1024 threads and we support less than 1024 shards.
Expand Down
8 changes: 8 additions & 0 deletions tests/dragonfly/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ def stop(self, kill=False):
proc.terminate()
proc.communicate(timeout=15)
except subprocess.TimeoutExpired:
# We need to send SIGUSR1 to DF such that it prints the stacktrace
proc.send_signal(signal.SIGUSR1)
# Then we sleep for 5 seconds such that DF has enough time to print the stacktraces
# We can't really synchronize here because SIGTERM and SIGKILL do not block even if
# sigaction explicitly blocks other incoming signals until it handles SIGUSR1.
# Even worse, on SIGTERM and SIGKILL none of the handlers registered via sigaction
# are guaranteed to run
time.sleep(5)
logging.debug(f"Unable to kill the process on port {self._port}")
logging.debug(f"INFO LOGS of DF are:")
self.print_info_logs_to_debug_log()
Expand Down

0 comments on commit 1d02e12

Please sign in to comment.