Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Important: reproducible segfault from version 3.0.0 onwards (both 3.0.0 and 3.1.0 affected) #3603

Closed
aldanor opened this issue Nov 27, 2020 · 11 comments

Comments

@aldanor
Copy link

aldanor commented Nov 27, 2020

Environment setup:

  • linux-64
  • numpy=1.18
  • lightgbm 3.0.0 or 3.1.0 (crashes in both cases), conda-forge version or built manually with clang/iomp5 (crashes in both cases)

(Should lightgbm run this kind of testing on CI to catch these sorts of errors as early as possible?)

Test script (click to expand)
from dataclasses import dataclass
import random
from typing import Any, Dict, List, Final, Optional

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_classification


MIN_FOREST_DEPTH: Final[int] = 1
MAX_FOREST_DEPTH: Final[int] = 8


def make_lightgbm_params(
    params: Dict[str, Any],
    mono: Optional[np.ndarray],
    depth: int,
    num_trees: int,
    n_jobs: int,
    seed: int,
) -> Dict[str, Any]:
    """Return a copy of *params* extended with the fixed LightGBM settings.

    Monotone constraints are attached only when *mono* is given and has at
    least one non-zero entry. The input dict is never mutated.
    """
    assert MIN_FOREST_DEPTH <= depth <= MAX_FOREST_DEPTH
    assert num_trees >= 1
    out = dict(params)
    out.update(
        objective='binary',
        tree_learner='serial',
        num_leaves=2 ** depth,
        verbosity=-1,
        num_threads=n_jobs,
        seed=seed,
        histogram_pool_size=-1,
        max_depth=depth,
    )
    if mono is not None and (mono != 0).any():
        out['monotone_constraints'] = mono.tolist()
    return out


@dataclass
class TrainData:
    """Synthetic binary-classification training data plus monotonicity spec."""

    signals: np.ndarray
    labels: np.ndarray
    feature_names: List[str]
    mono: Dict[str, int]

    @property
    def n_features(self) -> int:
        """Number of feature columns in the signal matrix."""
        n_cols = self.signals.shape[1]
        return int(n_cols)

    @property
    def mono_arr(self) -> np.ndarray:
        """Per-feature monotone constraints as int8 (0 for unlisted features)."""
        constraints = [self.mono.get(name, 0) for name in self.feature_names]
        return np.array(constraints).astype(np.int8)

    @classmethod
    def generate(cls, random_state: int, dtype: type):
        """Build a reproducible random dataset; the global RNG state is restored."""
        saved_state = random.getstate()
        random.seed(random_state)
        n_samples = random.randint(100, 10_000)
        n_features = random.randint(10, 40)
        features, targets = make_classification(
            n_samples, n_features, random_state=random_state
        )
        # Cast to *dtype* happens before round(), as in the original repro.
        signals = np.clip((features * 150).astype(dtype).round(), -511, 511)
        labels = targets > 0
        feature_names = ['f{:03d}'.format(idx) for idx in range(n_features)]
        mono: Dict[str, int] = {}
        if random_state % 2 != 0:
            mono = {name: (idx % 3) - 1 for idx, name in enumerate(feature_names)}
            # NOTE(review): unreachable for odd random_state (odd values are
            # never divisible by 4) — kept to match the original script.
            if random_state % 4 == 0:
                mono['x'] = 1
        random.setstate(saved_state)
        return cls(
            signals=signals,
            labels=labels,
            feature_names=feature_names,
            mono=mono,
        )


def generate_lightgbm_params(random_state: int, depth: int) -> Dict[str, Any]:
    """Draw a reproducible random LightGBM parameter dict for one seed.

    Each pool entry is kept with probability 0.9 and then patched so that
    mutually-incompatible settings are not emitted together. The global
    ``random`` state is saved and restored around the draw.
    """
    saved_state = random.getstate()
    random.seed(random_state)
    pool = {
        'boosting': ['gbdt', 'rf', 'dart', 'goss'],
        'learning_rate': [1e-6, 0.01, 0.1],
        'min_data_in_leaf': [0, 10],
        'min_sum_hessian_in_leaf': [0.0, 0.01],
        'bagging_fraction': [0.75, 1.0],
        'pos_bagging_fraction': [0.75, 1.0],
        'neg_bagging_fraction': [0.75, 1.0],
        'bagging_freq': [0, 2],
        'feature_fraction': [0.75, 1.0],
        'feature_fraction_bynode': [0.75, 1.0],
        'max_delta_step': [0.0, 0.01],
        'lambda_l1': [0.0, 0.01],
        'lambda_l2': [0.0, 0.01],
        'min_gain_to_split': [0.0, 0.01],
        'drop_rate': [0.0, 0.5, 1.0],
        'max_drop': [0, 100],
        'skip_drop': [0.0, 0.5, 1.0],
        'xgboost_dart_mode': [True, False],
        'uniform_drop': [True, False],
        'refit_decay_rate': [0.0, 0.5, 1.0],
        'max_bin': [2, 10, 255],
        'min_data_in_bin': [1, 25],
        'bin_construct_sample_cnt': [1, 50],
        'is_unbalance': [True, False],
        'scale_pos_weight': [0.01, 1.0, 1e2],
        'sigmoid': [0.01, 0.5],
        'boost_from_average': [True, False],
        # disabled in the original repro:
        # 'mc_method': ['basic', 'intermediate', 'advanced'],
        # 'mc_penalty': [0.0, 7.0],
    }
    params: Dict[str, Any] = {}
    for name, choices in pool.items():
        # Same RNG call order as the original comprehension:
        # one random() per key, then one randint() only for kept keys.
        if random.random() >= 0.1:
            params[name] = choices[random.randint(0, len(choices) - 1)]
    if params.get('min_data_in_leaf', 20) == 0 \
            and params.get('min_sum_hessian_in_leaf', 1e-3) == 0:
        params['min_data_in_leaf'] = 1
    if params.get('is_unbalance') and 'scale_pos_weight' in params:
        params['scale_pos_weight'] = 1.0
    top_rate, other_rate = random.choice(
        [(-1, 0.8), (-1, 0.01), (0.9, -1), (0.01, -1), (-1, -1)]
    )
    if top_rate != -1:
        params['top_rate'] = top_rate
    if other_rate != -1:
        params['other_rate'] = other_rate
    boosting = params.get('boosting')
    if boosting == 'goss' and 'bagging_fraction' in params:
        params['bagging_fraction'] = 1.0
    if boosting == 'rf':
        if params.get('bagging_freq', 0) == 0:
            params['bagging_freq'] = 2
        if params.get('bagging_fraction', 1.0) == 1.0:
            params['bagging_fraction'] = 0.5
    # mc_* keys are never drawn (commented out above); kept for parity.
    if params.get('mc_method', 'basic') != 'basic':
        if params.get('feature_fraction', 1) != 1:
            params['feature_fraction'] = 1
        if params.get('feature_fraction_bynode', 1) != 1:
            params['feature_fraction_bynode'] = 1
    if params.get('mc_penalty'):
        params['mc_penalty'] = min(params['mc_penalty'], float(depth) - 0.01)
    random.setstate(saved_state)
    return params


def test_lightgbm(random_state: int):
    """Train one randomly-parameterized LightGBM model for *random_state*.

    Seeds the global RNG, generates a dataset and a parameter dict, prints
    the configuration, then calls ``lgb.train`` (where the reported crash
    surfaces).
    """
    random.seed(random_state)
    depth = random.randint(MIN_FOREST_DEPTH, MAX_FOREST_DEPTH)
    num_trees = random.randint(1, 50)

    d = TrainData.generate(random_state, np.float32)
    params = {} if random_state == 0 else generate_lightgbm_params(random_state, depth)

    params = make_lightgbm_params(params, d.mono_arr, depth, num_trees, 1, 42)
    lgb_kwargs = dict(
        params=params,
        train_set=lgb.Dataset(d.signals, d.labels),
        num_boost_round=num_trees,
        verbose_eval=False,
    )

    # BUG FIX: the original printed the global loop variable `i`, which
    # raises NameError when this function is called standalone; print the
    # argument instead (identical output when driven by the __main__ loop,
    # which passes i as random_state).
    print(f'[#{random_state}]')
    print(f'{d.signals.shape=}, {depth=}, {num_trees=}')
    print(params)
    print(flush=True)
    lgb.train(**lgb_kwargs)

if __name__ == '__main__':
    # Sweep 1000 seeds; each iteration builds a fresh dataset/param combo,
    # so the crash reproduces after a few iterations on affected versions.
    for i in range(1000):
        test_lightgbm(i)

On lightgbm 2.3.1, the script finishes successfully.

Starting from version 3.0.0, it fails after a few iterations like this (the one below is from 3.1.0):

[#6]
d.signals.shape=(9501, 36), depth=2, num_trees=32
{'boosting': 'gbdt',
 'learning_rate': 0.01,
 'min_sum_hessian_in_leaf': 0.01,
 'bagging_fraction': 1.0,
 'pos_bagging_fraction': 0.75,
 'neg_bagging_fraction': 0.75,
 'bagging_freq': 2,
 'feature_fraction': 0.75,
 'feature_fraction_bynode': 1.0,
 'max_delta_step': 0.0,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'min_gain_to_split': 0.01,
 'drop_rate': 0.0,
 'max_drop': 100,
 'uniform_drop': False,
 'refit_decay_rate': 0.0,
 'max_bin': 255,
 'min_data_in_bin': 1,
 'bin_construct_sample_cnt': 1,
 'is_unbalance': False,
 'scale_pos_weight': 0.01,
 'boost_from_average': True,
 'other_rate': 0.8,
 'objective': 'binary',
 'tree_learner': 'serial',
 'num_leaves': 4,
 'verbosity': -1,
 'num_threads': 1,
 'seed': 42,
 'histogram_pool_size': -1,
 'max_depth': 2}


[LightGBM] [Fatal] Check failed: (best_split_info.right_count) > (0) 
    at /tmp/pip-req-build-kncfj4kn/compile/src/treelearner/serial_tree_learner.cpp, line 661 .

“python test_forest.py” terminated by signal SIGSEGV (Address boundary error)

By altering the generator a bit, you can also get a segfault from malloc about corrupted linked lists etc.

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

If you run this through gdb, you get this: so, a double free in Dataset? (although this may not be the source of the problem but rather a consequence)

*** Error in `./env/bin/python': double free or corruption (out): 0x00005555567151d0 ***

#5  0x00007fffd3d61dd9 in LightGBM::Dataset::~Dataset() ()
   from ./env/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so

@guolinke
Copy link
Collaborator

I guess bin_construct_sample_cnt=1 is the root cause. We already added warnings about this.

@guolinke
Copy link
Collaborator

Refer #3521

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

I'll try rerunning the example and report back. What's the minimum sensible value, something like 257 so it never chooses int8? That's not clear. Or actually, hold on, it's not well-defined, because it has to be 257 unique values observed, not the sample count.

Wonder if it would be possible to throw a preliminary exception (saying 'this has to be > ...') instead of a segfault?

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

I've updated the script to use [10_000, 200_000] for bin_construct_sample_cnt. It passed a hundred cases and then fails the same way on these parameters (note that bin_construct_sample_cnt = 200_000 here; is that because the dataset is just 5.4k samples?...):

[#150]
d.signals.shape=(5447, 38), depth=6, num_trees=26
{'boosting': 'goss',
 'learning_rate': 0.01,
 'min_data_in_leaf': 1,
 'min_sum_hessian_in_leaf': 0.0,
 'bagging_fraction': 1.0,
 'pos_bagging_fraction': 1.0,
 'neg_bagging_fraction': 1.0,
 'feature_fraction': 0.75,
 'feature_fraction_bynode': 0.75,
 'max_delta_step': 0.0,
 'lambda_l1': 0.01,
 'lambda_l2': 0.0,
 'drop_rate': 1.0,
 'max_drop': 100,
 'skip_drop': 1.0,
 'xgboost_dart_mode': True,
 'uniform_drop': False,
 'refit_decay_rate': 0.0,
 'max_bin': 10,
 'min_data_in_bin': 25,
 'bin_construct_sample_cnt': 200000,
 'is_unbalance': True,
 'scale_pos_weight': 1.0,
 'sigmoid': 0.01,
 'other_rate': 0.8,
 'objective': 'binary',
 'tree_learner': 'serial',
 'num_leaves': 64,
 'verbosity': -1,
 'num_threads': 1,
 'seed': 42,
 'histogram_pool_size': -1,
 'max_depth': 6}


[LightGBM] [Fatal] Check failed: (best_split_info.left_count) > (0) 
    at /tmp/pip-req-build-kncfj4kn/compile/src/treelearner/serial_tree_learner.cpp, line 651 .

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

@guolinke Any thoughts on the last case? (is that another implicit consequence of histogram binning changes, or something else?)

@guolinke
Copy link
Collaborator

guolinke commented Nov 28, 2020

@aldanor we remove the count histogram, and use hessian to estimate count.
Therefore, for the non-const hessian objective (like binary you use), with very small num_data_per_leaf (like 1 you used, and min_sum_hessian_in_leaf=0), the count could be estimated incorrectly, and cause the check fail.

We suggest to use larger num_data_per_leaf, and non-zero min_sum_hessian_in_leaf, to reduce the chance of estimation error.

@aldanor
Copy link
Author

aldanor commented Nov 28, 2020

Thanks. So does that mean num_data_per_leaf should be avoided now in general? (I'll try rerunning this later to see if it works)

@guolinke
Copy link
Collaborator

@aldanor avoid setting num_data_per_leaf to small values (e.g. smaller than 10), or avoid setting min_sum_hessian_in_leaf to zero.

@aldanor
Copy link
Author

aldanor commented Nov 30, 2020

@guolinke Thanks, that seems to have fixed it (although it would be nice if it was validated prior to running fits, especially the segfault with the low bin-construct hist sample count, which could probably be prevented by requiring some minimum reasonable value for that parameter). I'll close this off for now.

@github-actions
Copy link

This issue has been automatically locked since there has not been any recent activity since it was closed. To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues including a reference to this.

@github-actions github-actions bot locked as resolved and limited conversation to collaborators Aug 23, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants