Skip to content

Commit

Permalink
KUDU-2388. Fix TSAN race in MaintenanceManager::UnregisterOp
Browse files Browse the repository at this point in the history
This fixes the following TSAN race:

WARNING: ThreadSanitizer: data race (pid=17822)  Read of size 1 at 0x7b4c000054e8 by thread T59 (mutexes: write M1750):
...
    #3 strings::internal::SubstituteArg::SubstituteArg(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/gutil/strings/substitute.h:76 (libtserver.so+0x9edb0)
    #4 kudu::MaintenanceManager::LogPrefix() const /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:545:31 (libkudu_util.so+0x167791)
    #5 kudu::MaintenanceManager::UnregisterOp(kudu::MaintenanceOp*) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:235:3 (libkudu_util.so+0x165963)
    #6 kudu::MaintenanceOp::Unregister() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:123:13 (libkudu_util.so+0x1654fe)
    #7 kudu::tablet::Tablet::UnregisterMaintenanceOps() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet.cc:1405:9 (libtablet.so+0xfb5af)
    #8 kudu::tablet::TabletReplica::Stop() /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/tablet/tablet_replica.cc:271:25 (libtablet.so+0x146e66)
    #9 kudu::tserver::TSTabletManager::DeleteTablet(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > c

  Previous write of size 8 at 0x7b4c000054e8 by main thread:
    #0 memcpy /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/src/llvm-4.0.0.src/projects/compiler-rt/lib/tsan/../sanitizer_common/sanitizer_common_interceptors.inc:655 (kudu-tserver+0x449e4c)
    #1 std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__move_assign(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::integral_constant<bool, true>) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2044:18 (libkudu_util.so+0x16664d)
    #2 std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::operator=(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&&) /data/somelongdirectorytoavoidrpathissues/src/kudu/thirdparty/installed/tsan/include/c++/v1/string:2055 (libkudu_util.so+0x16664d)
    #3 kudu::MaintenanceManager::Init(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >) /data/somelongdirectorytoavoidrpathissues/src/kudu/src/kudu/util/maintenance_manager.cc:169 (libkudu_util.so+0x16664d)
...

The race is on the 'server_uuid_' field in the MaintenanceManager. This is set
during startup, but was being set _after_ calls such as UnregisterOp could be
made as seen above. That means the UnregisterOp call could either see an empty
UUID or even crash due to the above race.

This simply rejiggers the MaintenanceManager startup to take the UUID in as a
constructor parameter instead, and to instantiate the object slightly later
during startup.

Change-Id: Id06731f56eb98146f7b88541b936c0026b781b16
Reviewed-on: http://gerrit.cloudera.org:8080/9866
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <[email protected]>
  • Loading branch information
toddlipcon committed Mar 30, 2018
1 parent b2b8fb7 commit 51e2ca4
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 16 deletions.
8 changes: 5 additions & 3 deletions src/kudu/master/master.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,7 @@ Master::Master(const MasterOptions& opts)
catalog_manager_(new CatalogManager(this)),
path_handlers_(new MasterPathHandlers(this)),
opts_(opts),
registration_initialized_(false),
maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) {
registration_initialized_(false) {
}

Master::~Master() {
Expand All @@ -128,6 +127,9 @@ Status Master::Init() {
RETURN_NOT_OK(path_handlers_->Register(web_server_.get()));
}

maintenance_manager_.reset(new MaintenanceManager(
MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));

// The certificate authority object is initialized upon loading
// CA private key and certificate from the system table when the server
// becomes a leader.
Expand All @@ -152,7 +154,7 @@ Status Master::Start() {
Status Master::StartAsync() {
CHECK_EQ(kInitialized, state_);

RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid()));
RETURN_NOT_OK(maintenance_manager_->Start());

gscoped_ptr<ServiceIf> impl(new MasterServiceImpl(this));
gscoped_ptr<ServiceIf> consensus_service(new ConsensusServiceImpl(
Expand Down
8 changes: 5 additions & 3 deletions src/kudu/tserver/tablet_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ TabletServer::TabletServer(const TabletServerOptions& opts)
opts_(opts),
tablet_manager_(new TSTabletManager(this)),
scanner_manager_(new ScannerManager(metric_entity())),
path_handlers_(new TabletServerPathHandlers(this)),
maintenance_manager_(new MaintenanceManager(MaintenanceManager::kDefaultOptions)) {
path_handlers_(new TabletServerPathHandlers(this)) {
}

TabletServer::~TabletServer() {
Expand Down Expand Up @@ -95,6 +94,9 @@ Status TabletServer::Init() {
RETURN_NOT_OK(path_handlers_->Register(web_server_.get()));
}

maintenance_manager_.reset(new MaintenanceManager(
MaintenanceManager::kDefaultOptions, fs_manager_->uuid()));

heartbeater_.reset(new Heartbeater(opts_, this));

RETURN_NOT_OK_PREPEND(tablet_manager_->Init(),
Expand Down Expand Up @@ -130,7 +132,7 @@ Status TabletServer::Start() {
RETURN_NOT_OK(KuduServer::Start());

RETURN_NOT_OK(heartbeater_->Start());
RETURN_NOT_OK(maintenance_manager_->Init(fs_manager_->uuid()));
RETURN_NOT_OK(maintenance_manager_->Start());

google::FlushLogFiles(google::INFO); // Flush the startup messages.

Expand Down
4 changes: 2 additions & 2 deletions src/kudu/util/maintenance_manager-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ class MaintenanceManagerTest : public KuduTest {
options.num_threads = 2;
options.polling_interval_ms = 1;
options.history_size = kHistorySize;
manager_.reset(new MaintenanceManager(options));
manager_.reset(new MaintenanceManager(options, kFakeUuid));
manager_->set_memory_pressure_func_for_tests(
[&](double* consumption) {
return indicate_memory_pressure_.load();
});
ASSERT_OK(manager_->Init(kFakeUuid));
ASSERT_OK(manager_->Start());
}

void TearDown() override {
Expand Down
12 changes: 7 additions & 5 deletions src/kudu/util/maintenance_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,11 @@ const MaintenanceManager::Options MaintenanceManager::kDefaultOptions = {
.history_size = 0,
};

MaintenanceManager::MaintenanceManager(const Options& options)
: num_threads_(options.num_threads <= 0 ?
FLAGS_maintenance_manager_num_threads : options.num_threads),
MaintenanceManager::MaintenanceManager(const Options& options,
std::string server_uuid)
: server_uuid_(std::move(server_uuid)),
num_threads_(options.num_threads <= 0 ?
FLAGS_maintenance_manager_num_threads : options.num_threads),
cond_(&lock_),
shutdown_(false),
polling_interval_ms_(options.polling_interval_ms <= 0 ?
Expand All @@ -165,8 +167,8 @@ MaintenanceManager::~MaintenanceManager() {
Shutdown();
}

Status MaintenanceManager::Init(std::string server_uuid) {
server_uuid_ = std::move(server_uuid);
Status MaintenanceManager::Start() {
CHECK(!monitor_thread_);
RETURN_NOT_OK(Thread::Create("maintenance", "maintenance_scheduler",
boost::bind(&MaintenanceManager::RunSchedulerThread, this),
&monitor_thread_));
Expand Down
8 changes: 5 additions & 3 deletions src/kudu/util/maintenance_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,12 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage
uint32_t history_size;
};

explicit MaintenanceManager(const Options& options);
MaintenanceManager(const Options& options, std::string server_uuid);
~MaintenanceManager();

Status Init(std::string server_uuid);
// Start running the maintenance manager.
// Must be called at most once.
Status Start();
void Shutdown();

// Register an op with the manager.
Expand Down Expand Up @@ -318,6 +320,7 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage

std::string LogPrefix() const;

const std::string server_uuid_;
const int32_t num_threads_;
OpMapTy ops_; // registered operations
Mutex lock_;
Expand All @@ -331,7 +334,6 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage
// the completed_ops_count_ % the vector's size and then the count needs to be incremented.
std::vector<OpInstance> completed_ops_;
int64_t completed_ops_count_;
std::string server_uuid_;
Random rand_;

// Function which should return true if the server is under global memory pressure.
Expand Down

0 comments on commit 51e2ca4

Please sign in to comment.