From 51e2ca40d17d34e953d188432afa1d4025d45b99 Mon Sep 17 00:00:00 2001 From: Todd Lipcon Date: Thu, 29 Mar 2018 20:17:08 -0700 Subject: [PATCH] KUDU-2388. Fix TSAN race in MaintenanceManager::UnregisterOp This fixes the following TSAN race: WARNING: ThreadSanitizer: data race (pid=17822) Read of size 1 at 0x7b4c000054e8 by thread T59 (mutexes: write M1750): ... #3 strings::internal::SubstituteArg::SubstituteArg(std::__1::basic_string, std::__1::allocator > const&) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/gutil/strings/substitute.h:76 (libtserver.so+0x9edb0) #4 kudu::MaintenanceManager::LogPrefix() const /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:545:31 (libkudu_util.so+0x167791) #5 kudu::MaintenanceManager::UnregisterOp(kudu::MaintenanceOp*) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:235:3 (libkudu_util.so+0x165963) #6 kudu::MaintenanceOp::Unregister() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:123:13 (libkudu_util.so+0x1654fe) #7 kudu::tablet::Tablet::UnregisterMaintenanceOps() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet.cc:1405:9 (libtablet.so+0xfb5af) #8 kudu::tablet::TabletReplica::Stop() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet_replica.cc:271:25 (libtablet.so+0x146e66) #9 kudu::tserver::TSTabletManager::DeleteTablet(std::__1::basic_string, std::__1::allocator > c Previous write of size 8 at 0x7b4c000054e8 by main thread: #0 memcpy /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/src/llvm-4.0.0.src/projects/compiler-rt/lib/tsan/../sanitizer_common/sanitizer_common_interceptors.inc:655 (kudu-tserver+0x449e4c) #1 std::__1::basic_string, std::__1::allocator >::__move_assign(std::__1::basic_string, std::__1::allocator >&, std::__1::integral_constant) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2044:18 (libkudu_util.so+0x16664d) #2 std::__1::basic_string, std::__1::allocator >::operator=(std::__1::basic_string, std::__1::allocator >&&) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2055 (libkudu_util.so+0x16664d) #3 kudu::MaintenanceManager::Init(std::__1::basic_string, std::__1::allocator >) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:169 (libkudu_util.so+0x16664d) ... The race is on the 'server_uuid_' field in the MaintenanceManager. This is set during startup, but was being set _after_ calls such as UnregisterOp could be made as seen above. That means the UnregisterOp call could either see an empty UUID or even crash due to the above race. This simply rejiggers the MaintenanceManager startup to take the UUID in as a constructor parameter instead, and to instantiate the object slightly later during startup. Change-Id: Id06731f56eb98146f7b88541b936c0026b781b16 Reviewed-on: http://gerrit.cloudera.org:8080/9866 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo --- src/kudu/master/master.cc | 8 +++++--- src/kudu/tserver/tablet_server.cc | 8 +++++--- src/kudu/util/maintenance_manager-test.cc | 4 ++-- src/kudu/util/maintenance_manager.cc | 12 +++++++----- src/kudu/util/maintenance_manager.h | 8 +++++--- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc index ad09f47e0b..ce16a461e4 100644 --- a/src/kudu/master/master.cc +++ b/src/kudu/master/master.cc @@ -100,8 +100,7 @@ Master::Master(const MasterOptions& opts) catalog_manager_(new CatalogManager(this)), path_handlers_(new MasterPathHandlers(this)), opts_(opts), - registration_initialized_(false), - maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) { + registration_initialized_(false) { } Master::~Master() { @@ -128,6 +127,9 @@ Status Master::Init() { RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); } + maintenance_manager_.reset(new MaintenanceManager( + MaintenanceManager::kDefaultOptions, fs_manager_->uuid())); + // The certificate authority object is initialized upon loading // CA private key and certificate from the system table when the server // becomes a leader. @@ -152,7 +154,7 @@ Status Master::Start() { Status Master::StartAsync() { CHECK_EQ(kInitialized, state_); - RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid())); + RETURN_NOT_OK(maintenance_manager_->Start()); gscoped_ptr impl(new MasterServiceImpl(this)); gscoped_ptr consensus_service(new ConsensusServiceImpl( diff --git a/src/kudu/tserver/tablet_server.cc b/src/kudu/tserver/tablet_server.cc index 8fdb85d652..efbbe03ed6 100644 --- a/src/kudu/tserver/tablet_server.cc +++ b/src/kudu/tserver/tablet_server.cc @@ -56,8 +56,7 @@ TabletServer::TabletServer(const TabletServerOptions& opts) opts_(opts), tablet_manager_(new TSTabletManager(this)), scanner_manager_(new ScannerManager(metric_entity())), - path_handlers_(new TabletServerPathHandlers(this)), - maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) { + path_handlers_(new TabletServerPathHandlers(this)) { } TabletServer::~TabletServer() { @@ -95,6 +94,9 @@ Status TabletServer::Init() { RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); } + maintenance_manager_.reset(new MaintenanceManager( + MaintenanceManager::kDefaultOptions, fs_manager_->uuid())); + heartbeater_.reset(new Heartbeater(opts_, this)); RETURN_NOT_OK_PREPEND(tablet_manager_->Init(), @@ -130,7 +132,7 @@ Status TabletServer::Start() { RETURN_NOT_OK(KuduServer::Start()); RETURN_NOT_OK(heartbeater_->Start()); - RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid())); + RETURN_NOT_OK(maintenance_manager_->Start()); google::FlushLogFiles(google::INFO); // Flush the startup messages. diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc index 1889139e4d..6777e06b81 100644 --- a/src/kudu/util/maintenance_manager-test.cc +++ b/src/kudu/util/maintenance_manager-test.cc @@ -70,12 +70,12 @@ class MaintenanceManagerTest : public KuduTest { options.num_threads = 2; options.polling_interval_ms = 1; options.history_size = kHistorySize; - manager_.reset(new MaintenanceManager(options)); + manager_.reset(new MaintenanceManager(options, kFakeUuid)); manager_->set_memory_pressure_func_for_tests( [&](double* consumption) { return indicate_memory_pressure_.load(); }); - ASSERT_OK(manager_->Init(kFakeUuid)); + ASSERT_OK(manager_->Start()); } void TearDown() override { diff --git a/src/kudu/util/maintenance_manager.cc b/src/kudu/util/maintenance_manager.cc index 2694d6c94f..9a42464028 100644 --- a/src/kudu/util/maintenance_manager.cc +++ b/src/kudu/util/maintenance_manager.cc @@ -141,9 +141,11 @@ const MaintenanceManager::Options MaintenanceManager::kDefaultOptions = { .history_size = 0, }; -MaintenanceManager::MaintenanceManager(const Options& options) - : num_threads_(options.num_threads <= 0 ? - FLAGS_maintenance_manager_num_threads : options.num_threads), +MaintenanceManager::MaintenanceManager(const Options& options, + std::string server_uuid) + : server_uuid_(std::move(server_uuid)), + num_threads_(options.num_threads <= 0 ? + FLAGS_maintenance_manager_num_threads : options.num_threads), cond_(&lock_), shutdown_(false), polling_interval_ms_(options.polling_interval_ms <= 0 ? @@ -165,8 +167,8 @@ MaintenanceManager::~MaintenanceManager() { Shutdown(); } -Status MaintenanceManager::Init(std::string server_uuid) { - server_uuid_ = std::move(server_uuid); +Status MaintenanceManager::Start() { + CHECK(!monitor_thread_); RETURN_NOT_OK(Thread::Create("maintenance", "maintenance_scheduler", boost::bind(&MaintenanceManager::RunSchedulerThread, this), &monitor_thread_)); diff --git a/src/kudu/util/maintenance_manager.h b/src/kudu/util/maintenance_manager.h index dc36f67a21..7d20c8a48a 100644 --- a/src/kudu/util/maintenance_manager.h +++ b/src/kudu/util/maintenance_manager.h @@ -274,10 +274,12 @@ class MaintenanceManager : public std::enable_shared_from_this completed_ops_; int64_t completed_ops_count_; - std::string server_uuid_; Random rand_; // Function which should return true if the server is under global memory pressure.