Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Important: reproducible segfault from version 3.0.0 onwards (both 3.0.0 and 3.1.0 affected) #3603

Closed
aldanor opened this issue Nov 27, 2020 · 11 comments

Comments

@aldanor
Copy link

aldanor commented Nov 27, 2020

Environment setup:

  • linux-64
  • numpy=1.18
  • lightgbm 3.0.0 or 3.1.0 (crashes in both cases), conda-forge version or built manually with clang/iomp5 (crashes in both cases)

(Should lightgbm run this kind of testing on CI to catch these sorts of errors as early as possible?)

Test script (click to expand)
from dataclasses import dataclass
import random
from typing import Any, Dict, List, Final, Optional

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_classification


MIN_FOREST_DEPTH: Final[int] = 1
MAX_FOREST_DEPTH: Final[int] = 8


def make_lightgbm_params(
    params: Dict[str, Any],
    mono: Optional[np.ndarray],
    depth: int,
    num_trees: int,
    n_jobs: int,
    seed: int,
) -> Dict[str, Any]:
    """Return a copy of *params* extended with the fixed LightGBM settings.

    Monotone constraints are attached only when *mono* is given and has at
    least one non-zero entry. The input dict is never mutated.
    """
    assert MIN_FOREST_DEPTH <= depth <= MAX_FOREST_DEPTH
    assert num_trees >= 1
    out = dict(params)
    out.update(
        objective='binary',
        tree_learner='serial',
        num_leaves=2 ** depth,
        verbosity=-1,
        num_threads=n_jobs,
        seed=seed,
        histogram_pool_size=-1,
        max_depth=depth,
    )
    if mono is not None and (mono != 0).any():
        out['monotone_constraints'] = mono.tolist()
    return out


@dataclass
class TrainData:
    """Synthetic binary-classification training data plus monotonicity spec."""

    signals: np.ndarray
    labels: np.ndarray
    feature_names: List[str]
    mono: Dict[str, int]

    @property
    def n_features(self) -> int:
        """Number of feature columns in the signal matrix."""
        n_cols = self.signals.shape[1]
        return int(n_cols)

    @property
    def mono_arr(self) -> np.ndarray:
        """Per-feature monotone constraints as int8 (0 for unlisted features)."""
        constraints = [self.mono.get(name, 0) for name in self.feature_names]
        return np.array(constraints).astype(np.int8)

    @classmethod
    def generate(cls, random_state: int, dtype: type):
        """Build a reproducible random dataset; the global RNG state is restored."""
        saved_state = random.getstate()
        random.seed(random_state)
        n_samples = random.randint(100, 10_000)
        n_features = random.randint(10, 40)
        features, targets = make_classification(
            n_samples, n_features, random_state=random_state
        )
        # Cast to *dtype* happens before round(), as in the original repro.
        signals = np.clip((features * 150).astype(dtype).round(), -511, 511)
        labels = targets > 0
        feature_names = ['f{:03d}'.format(idx) for idx in range(n_features)]
        mono: Dict[str, int] = {}
        if random_state % 2 != 0:
            mono = {name: (idx % 3) - 1 for idx, name in enumerate(feature_names)}
            # NOTE(review): unreachable for odd random_state (odd values are
            # never divisible by 4) — kept to match the original script.
            if random_state % 4 == 0:
                mono['x'] = 1
        random.setstate(saved_state)
        return cls(
            signals=signals,
            labels=labels,
            feature_names=feature_names,
            mono=mono,
        )


def generate_lightgbm_params(random_state: int, depth: int) -> Dict[str, Any]:
    """Draw a reproducible random LightGBM parameter dict for one seed.

    Each pool entry is kept with probability 0.9 and then patched so that
    mutually-incompatible settings are not emitted together. The global
    ``random`` state is saved and restored around the draw.
    """
    saved_state = random.getstate()
    random.seed(random_state)
    pool = {
        'boosting': ['gbdt', 'rf', 'dart', 'goss'],
        'learning_rate': [1e-6, 0.01, 0.1],
        'min_data_in_leaf': [0, 10],
        'min_sum_hessian_in_leaf': [0.0, 0.01],
        'bagging_fraction': [0.75, 1.0],
        'pos_bagging_fraction': [0.75, 1.0],
        'neg_bagging_fraction': [0.75, 1.0],
        'bagging_freq': [0, 2],
        'feature_fraction': [0.75, 1.0],
        'feature_fraction_bynode': [0.75, 1.0],
        'max_delta_step': [0.0, 0.01],
        'lambda_l1': [0.0, 0.01],
        'lambda_l2': [0.0, 0.01],
        'min_gain_to_split': [0.0, 0.01],
        'drop_rate': [0.0, 0.5, 1.0],
        'max_drop': [0, 100],
        'skip_drop': [0.0, 0.5, 1.0],
        'xgboost_dart_mode': [True, False],
        'uniform_drop': [True, False],
        'refit_decay_rate': [0.0, 0.5, 1.0],
        'max_bin': [2, 10, 255],
        'min_data_in_bin': [1, 25],
        'bin_construct_sample_cnt': [1, 50],
        'is_unbalance': [True, False],
        'scale_pos_weight': [0.01, 1.0, 1e2],
        'sigmoid': [0.01, 0.5],
        'boost_from_average': [True, False],
        # disabled in the original repro:
        # 'mc_method': ['basic', 'intermediate', 'advanced'],
        # 'mc_penalty': [0.0, 7.0],
    }
    params: Dict[str, Any] = {}
    for name, choices in pool.items():
        # Same RNG call order as the original comprehension:
        # one random() per key, then one randint() only for kept keys.
        if random.random() >= 0.1:
            params[name] = choices[random.randint(0, len(choices) - 1)]
    if params.get('min_data_in_leaf', 20) == 0 \
            and params.get('min_sum_hessian_in_leaf', 1e-3) == 0:
        params['min_data_in_leaf'] = 1
    if params.get('is_unbalance') and 'scale_pos_weight' in params:
        params['scale_pos_weight'] = 1.0
    top_rate, other_rate = random.choice(
        [(-1, 0.8), (-1, 0.01), (0.9, -1), (0.01, -1), (-1, -1)]
    )
    if top_rate != -1:
        params['top_rate'] = top_rate
    if other_rate != -1:
        params['other_rate'] = other_rate
    boosting = params.get('boosting')
    if boosting == 'goss' and 'bagging_fraction' in params:
        params['bagging_fraction'] = 1.0
    if boosting == 'rf':
        if params.get('bagging_freq', 0) == 0:
            params['bagging_freq'] = 2
        if params.get('bagging_fraction', 1.0) == 1.0:
            params['bagging_fraction'] = 0.5
    # mc_* keys are never drawn (commented out above); kept for parity.
    if params.get('mc_method', 'basic') != 'basic':
        if params.get('feature_fraction', 1) != 1:
            params['feature_fraction'] = 1
        if params.get('feature_fraction_bynode', 1) != 1:
            params['feature_fraction_bynode'] = 1
    if params.get('mc_penalty'):
        params['mc_penalty'] = min(params['mc_penalty'], float(depth) - 0.01)
    random.setstate(saved_state)
    return params


def test_lightgbm(random_state: int):
    """Train one randomly-parameterized LightGBM model for *random_state*.

    Seeds the global RNG, generates a dataset and a parameter dict, prints
    the configuration, then calls ``lgb.train`` (where the reported crash
    surfaces).
    """
    random.seed(random_state)
    depth = random.randint(MIN_FOREST_DEPTH, MAX_FOREST_DEPTH)
    num_trees = random.randint(1, 50)

    d = TrainData.generate(random_state, np.float32)
    params = {} if random_state == 0 else generate_lightgbm_params(random_state, depth)

    params = make_lightgbm_params(params, d.mono_arr, depth, num_trees, 1, 42)
    lgb_kwargs = dict(
        params=params,
        train_set=lgb.Dataset(d.signals, d.labels),
        num_boost_round=num_trees,
        verbose_eval=False,
    )

    # BUG FIX: the original printed the global loop variable `i`, which
    # raises NameError when this function is called standalone; print the
    # argument instead (identical output when driven by the __main__ loop,
    # which passes i as random_state).
    print(f'[#{random_state}]')
    print(f'{d.signals.shape=}, {depth=}, {num_trees=}')
    print(params)
    print(flush=True)
    lgb.train(**lgb_kwargs)

if __name__ == '__main__':
    # Sweep 1000 seeds; each iteration builds a fresh dataset/param combo,
    # so the crash reproduces after a few iterations on affected versions.
    for i in range(1000):
        test_lightgbm(i)

On lightgbm 2.3.1, the script finishes successfully.

Starting from version 3.0.0, it fails after a few iterations like this (the one below is from 3.1.0):

[#6]
d.signals.shape=(9501, 36), depth=2, num_trees=32
{'boosting': 'gbdt',
 'learning_rate': 0.01,
 'min_sum_hessian_in_leaf': 0.01,
 'bagging_fraction': 1.0,
 'pos_bagging_fraction': 0.75,
 'neg_bagging_fraction': 0.75,
 'bagging_freq': 2,
 'feature_fraction': 0.75,
 'feature_fraction_bynode': 1.0,
 'max_delta_step': 0.0,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'min_gain_to_split': 0.01,
 'drop_rate': 0.0,
 'max_drop': 100,
 'uniform_drop': False,
 'refit_decay_rate': 0.0,
 'max_bin': 255,
 'min_data_in_bin': 1,
 'bin_construct_sample_cnt': 1,
 'is_unbalance': False,
 'scale_pos_weight': 0.01,
 'boost_from_average': True,
 'other_rate': 0.8,
 'objective': 'binary',
 'tree_learner': 'serial',
 'num_leaves': 4,
 'verbosity': -1,
 'num_threads': 1,
 'seed': 42,
 'histogram_pool_size': -1,
 'max_depth': 2}


[LightGBM] [Fatal] Check failed: (best_split_info.right_count) > (0) 
    at /tmp/pip-req-build-kncfj4kn/compile/src/treelearner/serial_tree_learner.cpp, line 661 .

“python test_forest.py” terminated by signal SIGSEGV (Address boundary error)

By altering the generator a bit, you can also get a segfault from malloc about corrupted linked lists etc.

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

If you run this through gdb, you get this: so, a double free in Dataset? (although this may not be the source of the problem but rather a consequence)

*** Error in `./env/bin/python': double free or corruption (out): 0x00005555567151d0 ***

#5  0x00007fffd3d61dd9 in LightGBM::Dataset::~Dataset() ()
   from ./env/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so

@guolinke
Copy link
Collaborator

I guess bin_construct_sample_cnt=1 is the root cause. We already added warnings about this.

@guolinke
Copy link
Collaborator

Refer #3521

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

I'll try rerunning the example and report back. What's the minimum sensible value, something like 257 so it never chooses int8? That's not clear. Or actually, hold on, it's not well-defined, because it has to be 257 unique values observed, not the sample count.

Wonder if it would be possible to throw a preliminary exception (saying 'this has to be > ...') instead of a segfault?

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

I've updated the script to use [10_000, 200_000] for bin_construct_sample_cnt. It passed a hundred cases and then fails the same way on these parameters (note that bin_construct_sample_cnt = 200_000 here; is that because the dataset is just 5.4k samples?...):

[#150]
d.signals.shape=(5447, 38), depth=6, num_trees=26
{'boosting': 'goss',
 'learning_rate': 0.01,
 'min_data_in_leaf': 1,
 'min_sum_hessian_in_leaf': 0.0,
 'bagging_fraction': 1.0,
 'pos_bagging_fraction': 1.0,
 'neg_bagging_fraction': 1.0,
 'feature_fraction': 0.75,
 'feature_fraction_bynode': 0.75,
 'max_delta_step': 0.0,
 'lambda_l1': 0.01,
 'lambda_l2': 0.0,
 'drop_rate': 1.0,
 'max_drop': 100,
 'skip_drop': 1.0,
 'xgboost_dart_mode': True,
 'uniform_drop': False,
 'refit_decay_rate': 0.0,
 'max_bin': 10,
 'min_data_in_bin': 25,
 'bin_construct_sample_cnt': 200000,
 'is_unbalance': True,
 'scale_pos_weight': 1.0,
 'sigmoid': 0.01,
 'other_rate': 0.8,
 'objective': 'binary',
 'tree_learner': 'serial',
 'num_leaves': 64,
 'verbosity': -1,
 'num_threads': 1,
 'seed': 42,
 'histogram_pool_size': -1,
 'max_depth': 6}


[LightGBM] [Fatal] Check failed: (best_split_info.left_count) > (0) 
    at /tmp/pip-req-build-kncfj4kn/compile/src/treelearner/serial_tree_learner.cpp, line 651 .

@aldanor
Copy link
Author

aldanor commented Nov 27, 2020

@guolinke Any thoughts on the last case? (is that another implicit consequence of histogram binning changes, or something else?)

@guolinke
Copy link
Collaborator

guolinke commented Nov 28, 2020

@aldanor we remove the count histogram, and use hessian to estimate count.
Therefore, for the non-const hessian objective (like binary you use), with very small num_data_per_leaf (like 1 you used, and min_sum_hessian_in_leaf=0), the count could be estimated incorrectly, and cause the check fail.

We suggest to use larger num_data_per_leaf, and non-zero min_sum_hessian_in_leaf, to reduce the chance of estimation error.

@aldanor
Copy link
Author

aldanor commented Nov 28, 2020

Thanks. So does that mean num_data_per_leaf should be avoided now in general? (I'll try rerunning this later to see if it works)

@guolinke
Copy link
Collaborator

@aldanor avoid setting num_data_per_leaf to small values (e.g. smaller than 10), or avoid setting min_sum_hessian_in_leaf to zero.

@aldanor
Copy link
Author

aldanor commented Nov 30, 2020

@guolinke Thanks, that seems to have fixed it (although it would be nice if it was validated prior to running fits, especially the segfault with the low bin-construct hist sample count, which could probably be prevented by requiring some minimum reasonable value for that parameter). I'll close this off for now.

@github-actions
Copy link

This issue has been automatically locked since there has not been any recent activity since it was closed. To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues including a reference to this.

@github-actions github-actions bot locked as resolved and limited conversation to collaborators Aug 23, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants