better document for bin_construct_sample_cnt (#3521)
* better document for bin_construct_sample_cnt

* add warnings

Co-authored-by: StrikerRUS <[email protected]>
guolinke and StrikerRUS authored Nov 6, 2020
1 parent 335ba23 commit bee732a
Showing 3 changed files with 22 additions and 4 deletions.
6 changes: 4 additions & 2 deletions docs/Parameters.rst
@@ -634,12 +634,14 @@ Dataset Parameters

 - ``bin_construct_sample_cnt`` :raw-html:`<a id="bin_construct_sample_cnt" title="Permalink to this parameter" href="#bin_construct_sample_cnt">&#x1F517;&#xFE0E;</a>`, default = ``200000``, type = int, aliases: ``subsample_for_bin``, constraints: ``bin_construct_sample_cnt > 0``

-  - number of data that sampled to construct histogram bins
+  - number of data that sampled to construct feature discrete bins

-  - setting this to larger value will give better training result, but will increase data loading time
+  - setting this to larger value will give better training result, but may increase data loading time

   - set this to larger value if data is very sparse

+  - **Note**: don't set this to small values, otherwise, you may encounter unexpected errors and poor accuracy
+
 - ``data_random_seed`` :raw-html:`<a id="data_random_seed" title="Permalink to this parameter" href="#data_random_seed">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``data_seed``

   - random seed for sampling data to construct histogram bins
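A minimal usage sketch, not part of this commit: raising ``bin_construct_sample_cnt`` from the Python package for a large dataset, as the documentation above suggests for sparse data. The value 500000 and the synthetic data are illustrative only.

```python
# Sketch only: the parameter value and the data are made up for illustration.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.random((100_000, 20))
y = rng.integers(0, 2, size=100_000)

params = {
    "objective": "binary",
    # default is 200000; a larger value gives a better sample for bin
    # construction at the cost of longer data loading
    "bin_construct_sample_cnt": 500_000,
}

train_set = lgb.Dataset(X, label=y)
booster = lgb.train(params, train_set, num_boost_round=10)
```
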
5 changes: 3 additions & 2 deletions include/LightGBM/config.h
@@ -580,9 +580,10 @@ struct Config {

   // alias = subsample_for_bin
   // check = >0
-  // desc = number of data that sampled to construct histogram bins
-  // desc = setting this to larger value will give better training result, but will increase data loading time
+  // desc = number of data that sampled to construct feature discrete bins
+  // desc = setting this to larger value will give better training result, but may increase data loading time
   // desc = set this to larger value if data is very sparse
+  // desc = **Note**: don't set this to small values, otherwise, you may encounter unexpected errors and poor accuracy
   int bin_construct_sample_cnt = 200000;

   // alias = data_seed
15 changes: 15 additions & 0 deletions src/io/dataset_loader.cpp
@@ -164,6 +164,16 @@ void DatasetLoader::SetHeader(const char* filename) {
   }
 }

+void CheckSampleSize(size_t sample_cnt, size_t num_data) {
+  if (static_cast<double>(sample_cnt) / num_data < 0.2f &&
+      sample_cnt < 100000) {
+    Log::Warning(
+        "Using too small ``bin_construct_sample_cnt`` may encounter "
+        "unexpected "
+        "errors and poor accuracy.");
+  }
+}
+
 Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
   // don't support query id in data file when training in parallel
   if (num_machines > 1 && !config_.pre_partition) {
@@ -190,6 +200,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
       dataset->num_data_ = static_cast<data_size_t>(text_data.size());
       // sample data
       auto sample_data = SampleTextDataFromMemory(text_data);
+      CheckSampleSize(sample_data.size(),
+                      static_cast<size_t>(dataset->num_data_));
       // construct feature bin mappers
       ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
       // initialize label
@@ -205,6 +217,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
       } else {
         dataset->num_data_ = num_global_data;
       }
+      CheckSampleSize(sample_data.size(),
+                      static_cast<size_t>(dataset->num_data_));
       // construct feature bin mappers
       ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
       // initialize label
@@ -540,6 +554,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
 Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
                                                 int** sample_indices, int num_col, const int* num_per_col,
                                                 size_t total_sample_size, data_size_t num_data) {
+  CheckSampleSize(total_sample_size, static_cast<size_t>(num_data));
   int num_total_features = num_col;
   if (Network::num_machines() > 1) {
     num_total_features = Network::GlobalSyncUpByMax(num_total_features);
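For reference, a standalone sketch (not LightGBM code) of the condition the new CheckSampleSize helper applies: the warning fires only when the sample used for bin construction covers less than 20% of the rows and is also below 100,000 rows in absolute terms, so the default of 200,000 should never trigger it.

```python
# Sketch of the warning condition used by CheckSampleSize above.
def sample_too_small(sample_cnt: int, num_data: int) -> bool:
    """True when the bin-construction sample is likely too small."""
    return (sample_cnt / num_data) < 0.2 and sample_cnt < 100_000

print(sample_too_small(10_000, 1_000_000))    # True:  1% of rows and < 100k
print(sample_too_small(200_000, 10_000_000))  # False: only 2% of rows, but >= 100k
print(sample_too_small(50_000, 60_000))       # False: < 100k, but covers > 20% of rows
```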
