Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Blocking] Fix #3840: Clean up logic for parsing tree_method parameter #3849

Merged
merged 11 commits into from
Nov 2, 2018
49 changes: 49 additions & 0 deletions src/common/enum_class_param.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*!
* Copyright 2015-2018 by Contributors
* \file enum_class_param.h
* \brief macro for using C++11 enum class as DMLC parameter
* \author Hyunsu Philip Cho
*/

#ifndef XGBOOST_COMMON_ENUM_CLASS_PARAM_H_
#define XGBOOST_COMMON_ENUM_CLASS_PARAM_H_

#include <dmlc/parameter.h>
#include <string>
#include <type_traits>

/*!
 * \brief Specialize dmlc::parameter::FieldEntry for a C++11 enum class whose
 *        underlying type is int, so the enum class can be declared as a DMLC
 *        parameter field. All string-to-value machinery (add_enum, Set,
 *        defaults) is delegated to the existing FieldEntry<int> base.
 *
 * Invoke at namespace scope, after the enum class definition:
 *   DECLARE_FIELD_ENUM_CLASS(MyEnum);
 *
 * Note: only block comments are used inside the macro body; a // comment
 * would absorb the trailing line-continuation backslash.
 */
#define DECLARE_FIELD_ENUM_CLASS(EnumClass) \
namespace dmlc { \
namespace parameter { \
template <> \
class FieldEntry<EnumClass> : public FieldEntry<int> { \
 public: \
  FieldEntry<EnumClass>() { \
    /* the base class stores values as int, so the enum must be int-backed */ \
    static_assert( \
      std::is_same<int, typename std::underlying_type<EnumClass>::type>::value, \
      "enum class must be backed by int"); \
    is_enum_ = true; \
  } \
  using Super = FieldEntry<int>; \
  /* parse a string key and store the matching enum value into *head */ \
  void Set(void *head, const std::string &value) const override { \
    Super::Set(head, value); \
  } \
  /* register a (string key -> enum value) mapping; returns *this to chain */ \
  inline FieldEntry<EnumClass>& add_enum(const std::string &key, EnumClass value) { \
    Super::add_enum(key, static_cast<int>(value)); \
    return *this; \
  } \
  /* record the default as its int representation */ \
  inline FieldEntry<EnumClass>& set_default(const EnumClass& default_value) { \
    default_value_ = static_cast<int>(default_value); \
    has_default_ = true; \
    return *this; \
  } \
  /* bind the field reference; reinterpret_cast is valid because the */ \
  /* static_assert above guarantees the enum is layout-compatible with int */ \
  /* NOLINTNEXTLINE */ \
  inline void Init(const std::string &key, void *head, EnumClass& ref) { \
    Super::Init(key, head, *reinterpret_cast<int*>(&ref)); \
  } \
}; \
} /* namespace parameter */ \
} /* namespace dmlc */

#endif // XGBOOST_COMMON_ENUM_CLASS_PARAM_H_
177 changes: 134 additions & 43 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,22 @@
#include "./common/host_device_vector.h"
#include "./common/io.h"
#include "./common/random.h"
#include "./common/enum_class_param.h"
#include "common/timer.h"

namespace {

// Default value for the max_delta_step parameter, stored as a string because
// parameter configuration flows through string key/value pairs.
// NOTE(review): the consuming code is outside this view -- confirm usage.
const char* kMaxDeltaStepDefaultValue = "0.7";

// Tree construction algorithm selected via the `tree_method` parameter.
// Explicitly backed by int so it satisfies the static_assert inside
// DECLARE_FIELD_ENUM_CLASS; the numeric values match the legacy integer
// encoding of tree_method.
enum class TreeMethod : int {
kAuto = 0, kApprox = 1, kExact = 2, kHist = 3,
kGPUExact = 4, kGPUHist = 5
};

} // anonymous namespace

// Specialize dmlc::parameter::FieldEntry<TreeMethod> (macro defined in
// src/common/enum_class_param.h) so TreeMethod can be a DMLC parameter field.
DECLARE_FIELD_ENUM_CLASS(TreeMethod);

namespace xgboost {
// implementation of base learner.
bool Learner::AllowLazyCheckPoint() const {
Expand Down Expand Up @@ -82,7 +90,7 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
// data split mode, can be row, col, or none.
int dsplit;
// tree construction method
int tree_method;
TreeMethod tree_method;
// internal test flag
std::string test_flag;
// number of threads to use if OpenMP is enabled
Expand All @@ -109,13 +117,13 @@ struct LearnerTrainParam : public dmlc::Parameter<LearnerTrainParam> {
.add_enum("row", 2)
.describe("Data split mode for distributed training.");
DMLC_DECLARE_FIELD(tree_method)
.set_default(0)
.add_enum("auto", 0)
.add_enum("approx", 1)
.add_enum("exact", 2)
.add_enum("hist", 3)
.add_enum("gpu_exact", 4)
.add_enum("gpu_hist", 5)
.set_default(TreeMethod::kAuto)
.add_enum("auto", TreeMethod::kAuto)
.add_enum("approx", TreeMethod::kApprox)
.add_enum("exact", TreeMethod::kExact)
.add_enum("hist", TreeMethod::kHist)
.add_enum("gpu_exact", TreeMethod::kGPUExact)
.add_enum("gpu_hist", TreeMethod::kGPUHist)
.describe("Choice of tree construction method.");
DMLC_DECLARE_FIELD(test_flag).set_default("").describe(
"Internal test flag");
Expand Down Expand Up @@ -154,37 +162,49 @@ class LearnerImpl : public Learner {
}

// Map the `tree_method` parameter to a concrete updater sequence in cfg_.
// If the user set `updater` explicitly, tree_method is ignored (expert mode).
// Reconstructed post-change side of the diff: the removed-side lines that the
// extraction interleaved into this span have been dropped.
void ConfigureUpdaters() {
  /* Choose updaters according to tree_method parameters */
  if (cfg_.count("updater") > 0) {
    LOG(CONSOLE) << "DANGER AHEAD: You have manually specified `updater` "
                    "parameter. The `tree_method` parameter will be ignored. "
                    "Incorrect sequence of updaters will produce undefined "
                    "behavior. For common uses, we recommend using "
                    "`tree_method` parameter instead.";
    return;
  }

  switch (tparam_.tree_method) {
    case TreeMethod::kAuto:
      // Use heuristic to choose between 'exact' and 'approx'
      // This choice is deferred to PerformTreeMethodHeuristic().
      break;
    case TreeMethod::kApprox:
      cfg_["updater"] = "grow_histmaker,prune";
      break;
    case TreeMethod::kExact:
      cfg_["updater"] = "grow_colmaker,prune";
      break;
    case TreeMethod::kHist:
      LOG(CONSOLE) << "Tree method is selected to be 'hist', which uses a "
                      "single updater grow_fast_histmaker.";
      cfg_["updater"] = "grow_fast_histmaker";
      break;
    case TreeMethod::kGPUExact:
      this->AssertGPUSupport();
      cfg_["updater"] = "grow_gpu,prune";
      // only fill in the predictor if the user did not pick one
      if (cfg_.count("predictor") == 0) {
        cfg_["predictor"] = "gpu_predictor";
      }
      break;
    case TreeMethod::kGPUHist:
      this->AssertGPUSupport();
      cfg_["updater"] = "grow_gpu_hist";
      if (cfg_.count("predictor") == 0) {
        cfg_["predictor"] = "gpu_predictor";
      }
      break;
    default:
      LOG(FATAL) << "Unknown tree_method ("
                 << static_cast<int>(tparam_.tree_method) << ") detected";
  }
}

Expand Down Expand Up @@ -376,7 +396,7 @@ class LearnerImpl : public Learner {
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
this->PerformTreeMethodHeuristic(train);
monitor_.Start("PredictRaw");
this->PredictRaw(train, &preds_);
monitor_.Stop("PredictRaw");
Expand All @@ -393,7 +413,7 @@ class LearnerImpl : public Learner {
if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
}
this->LazyInitDMatrix(train);
this->PerformTreeMethodHeuristic(train);
gbm_->DoBoost(train, in_gpair);
monitor_.Stop("BoostOneIter");
}
Expand Down Expand Up @@ -479,21 +499,92 @@ class LearnerImpl : public Learner {
}

protected:
// check if p_train is ready to used by training.
// if not, initialize the column access.
inline void LazyInitDMatrix(DMatrix* p_train) {
if (tparam_.tree_method == 3 || tparam_.tree_method == 4 ||
tparam_.tree_method == 5 || name_gbm_ == "gblinear") {
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix
inline void PerformTreeMethodHeuristic(DMatrix* p_train) {
if (name_gbm_ != "gbtree" || cfg_.count("updater") > 0) {
// 1. This method is not applicable for non-tree learners
// 2. This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}

if (!p_train->SingleColBlock() && cfg_.count("updater") == 0) {
if (tparam_.tree_method == 2) {
LOG(CONSOLE) << "tree method is set to be 'exact',"
<< " but currently we are only able to proceed with "
"approximate algorithm";
const TreeMethod current_tree_method = tparam_.tree_method;
if (rabit::IsDistributed()) {
/* Choose tree_method='approx' when distributed training is activated */
CHECK(tparam_.dsplit != 0)
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
<< "Precondition violated; dsplit cannot be zero in distributed mode";
if (tparam_.dsplit == 1) {
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
LOG(FATAL) << "Column-wise data split is currently not supported";
}
cfg_["updater"] = "grow_histmaker,prune";
switch (current_tree_method) {
case TreeMethod::kAuto:
LOG(CONSOLE) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
break;
case TreeMethod::kApprox:
// things are okay, do nothing
break;
case TreeMethod::kExact:
case TreeMethod::kHist:
LOG(CONSOLE) << "Tree method was set to be '"
<< (current_tree_method == TreeMethod::kExact ?
"exact" : "hist")
<< "', but only 'approx' is available for distributed "
"training. The `tree_method` parameter is now being "
"changed to 'approx'";
break;
case TreeMethod::kGPUExact:
case TreeMethod::kGPUHist:
LOG(FATAL) << "Distributed training is not available with GPU algoritms";
break;
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(current_tree_method) << ") detected";
}
tparam_.tree_method = TreeMethod::kApprox;
} else if (!p_train->SingleColBlock()) {
/* Some tree methods are not available for external-memory DMatrix */
switch (current_tree_method) {
case TreeMethod::kAuto:
LOG(CONSOLE) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
break;
case TreeMethod::kApprox:
// things are okay, do nothing
break;
case TreeMethod::kExact:
LOG(CONSOLE) << "Tree method was set to be 'exact', "
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
"but currently we are only able to proceed with "
"approximate algorithm ('approx') because external-"
"memory data matrix is used.";
break;
case TreeMethod::kHist:
// things are okay, do nothing
break;
case TreeMethod::kGPUExact:
case TreeMethod::kGPUHist:
LOG(FATAL)
<< "External-memory data matrix is not available with GPU algorithms";
break;
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(current_tree_method) << ") detected";
}
tparam_.tree_method = TreeMethod::kApprox;
} else if (p_train->Info().num_row_ >= (4UL << 20UL)
&& current_tree_method == TreeMethod::kAuto) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(CONSOLE) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
}

/* If tree_method was changed, re-configure updaters and gradient boosters */
if (tparam_.tree_method != current_tree_method) {
ConfigureUpdaters();
if (gbm_ != nullptr) {
gbm_->Configure(cfg_.begin(), cfg_.end());
}
Expand Down
55 changes: 55 additions & 0 deletions tests/cpp/common/test_enum_class_param.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include "../../../src/common/enum_class_param.h"
#include <dmlc/parameter.h>
#include <gtest/gtest.h>

// Sample enum class used to exercise DECLARE_FIELD_ENUM_CLASS.
// Must be backed by int to satisfy the macro's static_assert.
enum class Foo : int {
kBar = 0, kFrog = 1, kCat = 2, kDog = 3
};

// Generate the dmlc::parameter::FieldEntry<Foo> specialization so Foo can be
// declared as a DMLC parameter field below.
DECLARE_FIELD_ENUM_CLASS(Foo);

// Parameter struct combining an enum-class field (foo) with a plain int
// field (bar), verifying both kinds of field can be registered side by side.
struct MyParam : dmlc::Parameter<MyParam> {
Foo foo;
int bar;
DMLC_DECLARE_PARAMETER(MyParam) {
// foo accepts the strings "bar"/"frog"/"cat"/"dog"; any other string is a
// parameter error. Default is kBar.
DMLC_DECLARE_FIELD(foo)
.set_default(Foo::kBar)
.add_enum("bar", Foo::kBar)
.add_enum("frog", Foo::kFrog)
.add_enum("cat", Foo::kCat)
.add_enum("dog", Foo::kDog);
// plain int field, to confirm enum support does not disturb other fields
DMLC_DECLARE_FIELD(bar)
.set_default(-1);
}
};

// Register MyParam with the global DMLC parameter manager.
DMLC_REGISTER_PARAMETER(MyParam);

// End-to-end check of DECLARE_FIELD_ENUM_CLASS via MyParam: string keys map
// to the right enum values, coexist with an int field, and unknown keys fail.
TEST(EnumClassParam, Basic) {
  MyParam param;
  std::map<std::string, std::string> kwargs{
    {"foo", "frog"}, {"bar", "10"}
  };
  // initialization sets both the enum field and the plain int field
  param.Init(kwargs);
  ASSERT_EQ(param.foo, Foo::kFrog);
  ASSERT_EQ(param.bar, 10);

  // every registered string resolves to its enum value (same order as before)
  const std::pair<const char*, Foo> enum_cases[] = {
    {"bar", Foo::kBar},
    {"frog", Foo::kFrog},
    {"cat", Foo::kCat},
    {"dog", Foo::kDog}
  };
  for (const auto& kv : enum_cases) {
    kwargs["foo"] = kv.first;
    param.Init(kwargs);
    ASSERT_EQ(param.foo, kv.second);
  }

  // an unregistered string must raise a parameter error
  kwargs["foo"] = "human";
  ASSERT_THROW(param.Init(kwargs), dmlc::ParamError);
}
}