From 8fc2456c872fbff676e72cc28e354f8c12936544 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 24 Apr 2016 16:34:46 +0900 Subject: [PATCH] Enable flake8 --- python-package/setup.py | 14 +- python-package/setup_pip.py | 38 ++--- python-package/xgboost/__init__.py | 2 +- python-package/xgboost/compat.py | 16 +- python-package/xgboost/core.py | 22 ++- python-package/xgboost/libpath.py | 3 +- python-package/xgboost/plotting.py | 2 + python-package/xgboost/rabit.py | 22 +-- python-package/xgboost/sklearn.py | 1 - python-package/xgboost/training.py | 43 +++--- tests/python/test_basic.py | 4 +- tests/python/test_basic_models.py | 166 +++++++++++---------- tests/python/test_early_stopping.py | 3 +- tests/python/test_eval_metrics.py | 5 +- tests/python/test_plotting.py | 1 + tests/python/test_training_continuation.py | 41 +++-- tests/python/test_with_pandas.py | 38 +++-- tests/python/test_with_sklearn.py | 75 ++++++---- tests/travis/run_test.sh | 3 + 19 files changed, 291 insertions(+), 208 deletions(-) diff --git a/python-package/setup.py b/python-package/setup.py index 12bc1a89b589..5b66a097529a 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -4,7 +4,7 @@ import sys import os from setuptools import setup, find_packages -#import subprocess +# import subprocess sys.path.insert(0, '.') CURRENT_DIR = os.path.dirname(__file__) @@ -18,12 +18,12 @@ LIB_PATH = libpath['find_lib_path']() print("Install libxgboost from: %s" % LIB_PATH) -#Please use setup_pip.py for generating and deploying pip installation -#detailed instruction in setup_pip.py +# Please use setup_pip.py for generating and deploying pip installation +# detailed instruction in setup_pip.py setup(name='xgboost', version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), - #version='0.4a23', - description = "XGBoost Python Package", + # version='0.4a23', + description="XGBoost Python Package", long_description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(), install_requires=[ 'numpy', @@ -33,8 +33,8 @@ maintainer_email='phunter.lau@gmail.com', zip_safe=False, packages=find_packages(), - #this will use MANIFEST.in during install where we specify additional files, - #this is the golden line + # this will use MANIFEST.in during install where we specify additional files, + # this is the golden line include_package_data=True, data_files=[('xgboost', LIB_PATH)], url='https://github.com/dmlc/xgboost') diff --git a/python-package/setup_pip.py b/python-package/setup_pip.py index a6a1638e6c0e..b5aae4d0ccd9 100644 --- a/python-package/setup_pip.py +++ b/python-package/setup_pip.py @@ -4,14 +4,14 @@ import sys import os from setuptools import setup, find_packages -#import subprocess +# import subprocess sys.path.insert(0, '.') -#this script is for packing and shipping pip installation -#it builds xgboost code on the fly and packs for pip -#please don't use this file for installing from github +# this script is for packing and shipping pip installation +# it builds xgboost code on the fly and packs for pip +# please don't use this file for installing from github -if os.name != 'nt': #if not windows, compile and install +if os.name != 'nt': # if not windows, compile and install os.system('sh ./xgboost/build-python.sh') else: print('Windows users please use github installation.') @@ -28,12 +28,12 @@ LIB_PATH = libpath['find_lib_path']() -#to deploy to pip, please use -#make pythonpack -#python setup.py register sdist upload -#and be sure to test it firstly using "python setup.py register sdist upload -r pypitest" +# to deploy to pip, please use +# make pythonpack +# python setup.py register sdist upload +# and be sure to test it firstly using "python setup.py register sdist upload -r pypitest" setup(name='xgboost', - #version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), + # version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), version='0.4a30', description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(), install_requires=[ @@ -44,15 +44,15 @@ maintainer_email='phunter.lau@gmail.com', zip_safe=False, packages=find_packages(), - #don't need this and don't use this, give everything to MANIFEST.in - #package_dir = {'':'xgboost'}, - #package_data = {'': ['*.txt','*.md','*.sh'], + # don't need this and don't use this, give everything to MANIFEST.in + # package_dir = {'':'xgboost'}, + # package_data = {'': ['*.txt','*.md','*.sh'], # } - #this will use MANIFEST.in during install where we specify additional files, - #this is the golden line + # this will use MANIFEST.in during install where we specify additional files, + # this is the golden line include_package_data=True, - #!!! don't use data_files for creating pip installation, - #otherwise install_data process will copy it to - #root directory for some machines, and cause confusions on building - #data_files=[('xgboost', LIB_PATH)], + # !!! don't use data_files for creating pip installation, + # otherwise install_data process will copy it to + # root directory for some machines, and cause confusions on building + # data_files=[('xgboost', LIB_PATH)], url='https://github.com/dmlc/xgboost') diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index 304e72355175..2abd10eed2e8 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -10,7 +10,7 @@ from .core import DMatrix, Booster from .training import train, cv -from . import rabit +from . import rabit # noqa try: from .sklearn import XGBModel, XGBClassifier, XGBRegressor from .plotting import plot_importance, plot_tree, to_graphviz diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 81234df4a80d..671090968e7b 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -12,11 +12,21 @@ if PY3: # pylint: disable=invalid-name, redefined-builtin STRING_TYPES = str, - py_str = lambda x: x.decode('utf-8') + + def py_str(x): + return x.decode('utf-8') else: # pylint: disable=invalid-name STRING_TYPES = basestring, - py_str = lambda x: x + + def py_str(x): + return x + +try: + import cPickle as pickle # noqa +except ImportError: + import pickle # noqa + # pandas try: @@ -34,7 +44,7 @@ class DataFrame(object): try: from sklearn.base import BaseEstimator from sklearn.base import RegressorMixin, ClassifierMixin - from sklearn.preprocessing import LabelEncoder + from sklearn.preprocessing import LabelEncoder # noqa from sklearn.cross_validation import KFold, StratifiedKFold SKLEARN_INSTALLED = True diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 80d171326dd6..db7c50699704 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -14,6 +14,7 @@ from .compat import STRING_TYPES, PY3, DataFrame, py_str + class XGBoostError(Exception): """Error throwed by xgboost trainer.""" pass @@ -82,6 +83,7 @@ def _load_lib(): # load the XGBoost library globally _LIB = _load_lib() + def _check_call(ret): """Check the return value of C API call @@ -129,7 +131,6 @@ def c_array(ctype, values): return (ctype * len(values))(*values) - PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'float32': 'float', 'float64': 'float', @@ -144,8 +145,12 @@ def _maybe_pandas_data(data, feature_names, feature_types): data_dtypes = data.dtypes if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): - bad_fields = [data.columns[i] for i, dtype in enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER ] - raise ValueError('DataFrame.dtypes for data must be int, float or bool.\nDid not expect the data types in fie lds '+', '.join(bad_fields)) + bad_fields = [data.columns[i] for i, dtype in + enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER] + + msg = """DataFrame.dtypes for data must be int, float or bool. +Did not expect the data types in fields """ + raise ValueError(msg + ', '.join(bad_fields)) if feature_names is None: feature_names = data.columns.format() @@ -174,6 +179,7 @@ def _maybe_pandas_label(label): return label + class DMatrix(object): """Data Matrix used in XGBoost. @@ -1041,8 +1047,14 @@ def _validate_features(self, data): if self.feature_names != data.feature_names: dat_missing = set(self.feature_names) - set(data.feature_names) my_missing = set(data.feature_names) - set(self.feature_names) + msg = 'feature_names mismatch: {0} {1}' - if dat_missing: msg +='\nexpected ' + ', '.join(str(s) for s in dat_missing) +' in input data' - if my_missing: msg +='\ntraining data did not have the following fields: ' + ', '.join(str(s) for s in my_missing) + + if dat_missing: + msg += '\nexpected ' + ', '.join(str(s) for s in dat_missing) + ' in input data' + + if my_missing: + msg += '\ntraining data did not have the following fields: ' + ', '.join(str(s) for s in my_missing) + raise ValueError(msg.format(self.feature_names, data.feature_names)) diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index a703dcd7b44c..2d69bda4d45c 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -36,7 +36,8 @@ def find_lib_path(): else: dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - #From github issues, most of installation errors come from machines w/o compilers + + # From github issues, most of installation errors come from machines w/o compilers if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False): raise XGBoostLibraryNotFound( 'Cannot find XGBoost Libarary in the candicate path, ' + diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index e57ff77e9ed5..0a70799ad5d5 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -10,6 +10,7 @@ from .core import Booster from .sklearn import XGBModel + def plot_importance(booster, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance', xlabel='F score', ylabel='Features', @@ -105,6 +106,7 @@ def plot_importance(booster, ax=None, height=0.2, _EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)') _EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)') + def _parse_node(graph, text): """parse dumped node""" match = _NODEPAT.match(text) diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py index ae2426a1a203..af85b2dd043a 100644 --- a/python-package/xgboost/rabit.py +++ b/python-package/xgboost/rabit.py @@ -1,11 +1,12 @@ """Distributed XGBoost Rabit related API.""" from __future__ import absolute_import import sys -import atexit import ctypes import numpy as np from .core import _LIB, c_str, STRING_TYPES +from .compat import pickle + def _init_rabit(): """internal libary initializer.""" @@ -15,6 +16,7 @@ def _init_rabit(): _LIB.RabitIsDistributed.restype = ctypes.c_int _LIB.RabitVersionNumber.restype = ctypes.c_int + def init(args=None): """Initialize the rabit libary with arguments""" if args is None: @@ -73,6 +75,7 @@ def tracker_print(msg): sys.stdout.write(msg) sys.stdout.flush() + def get_processor_name(): """Get the processor name. @@ -127,14 +130,14 @@ def broadcast(data, root): # enumeration of dtypes DTYPE_ENUM__ = { - np.dtype('int8') : 0, - np.dtype('uint8') : 1, - np.dtype('int32') : 2, - np.dtype('uint32') : 3, - np.dtype('int64') : 4, - np.dtype('uint64') : 5, - np.dtype('float32') : 6, - np.dtype('float64') : 7 + np.dtype('int8'): 0, + np.dtype('uint8'): 1, + np.dtype('int32'): 2, + np.dtype('uint32'): 3, + np.dtype('int64'): 4, + np.dtype('uint64'): 5, + np.dtype('float32'): 6, + np.dtype('float64'): 7 } @@ -175,6 +178,7 @@ def allreduce(data, op, prepare_fun=None): op, None, None) else: func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + def pfunc(args): """prepare function.""" prepare_fun(data) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 2e8157ff6f86..0a1d8d8e2174 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -366,7 +366,6 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, self.classes_ = np.unique(y) self.n_classes_ = len(self.classes_) - xgb_options = self.get_xgb_params() if callable(self.objective): diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 929a79f22e17..2bfb2b0fb1f8 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -6,12 +6,12 @@ import sys import re -import os import numpy as np -from .core import Booster, STRING_TYPES -from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold) +from .core import Booster, STRING_TYPES, XGBoostError +from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold) from . import rabit + def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, xgb_model=None): @@ -97,7 +97,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, verbose_eval = True if verbose_eval_every_line > 0 else False if rabit.get_rank() != 0: - verbose_eval = False; + verbose_eval = False if xgb_model is not None: if not isinstance(xgb_model, STRING_TYPES): @@ -135,8 +135,9 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, if isinstance(params, list): if len(params) != len(dict(params).items()): params = dict(params) - rabit.tracker_print("Multiple eval metrics have been passed: " \ - "'{0}' will be used for early stopping.\n\n".format(params['eval_metric'])) + msg = ("Multiple eval metrics have been passed: " + "'{0}' will be used for early stopping.\n\n") + rabit.tracker_print(msg.format(params['eval_metric'])) else: params = dict(params) @@ -173,7 +174,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, # Distributed code: need to resume to this point. # Skip the first update if it is a recovery step. - if version % 2 == 0: + if version % 2 == 0: bst.update(dtrain, i, obj) bst.save_rabit_checkpoint() version += 1 @@ -203,7 +204,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, evals_idx = evals_name.index(key) res_per_eval = len(res) // len(evals_name) for r in range(res_per_eval): - res_item = res[(evals_idx*res_per_eval) + r] + res_item = res[(evals_idx * res_per_eval) + r] res_key = res_item[0] res_val = res_item[1] if res_key in evals_result[key]: @@ -224,7 +225,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, elif i - best_iteration >= early_stopping_rounds: best_msg = bst.attr('best_msg') if verbose_eval: - rabit.tracker_print("Stopping. Best iteration:\n{}\n\n".format(best_msg)) + msg = "Stopping. Best iteration:\n{}\n\n" + rabit.tracker_print(msg.format(best_msg)) break # do checkpoint after evaluation, in case evaluation also updates booster. bst.save_rabit_checkpoint() @@ -290,6 +292,7 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, ret.append(CVPack(dtrain, dtest, plst)) return ret + def aggcv(rlist, show_stdv=True, verbose_eval=None, as_pandas=True, trial=0): # pylint: disable=invalid-name """ @@ -405,8 +408,8 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None ------- evaluation history : list(string) """ - if stratified == True and not SKLEARN_INSTALLED: - raise XGBoostError('sklearn needs to be installed in order to use stratified cv') + if stratified is True and not SKLEARN_INSTALLED: + raise XGBoostError('sklearn needs to be installed in order to use stratified cv') if isinstance(metrics, str): metrics = [metrics] @@ -417,7 +420,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None if 'eval_metric' in params: params['eval_metric'] = _metrics else: - params= dict((k, v) for k, v in params.items()) + params = dict((k, v) for k, v in params.items()) if len(metrics) == 0 and 'eval_metric' in params: if isinstance(params['eval_metric'], list): @@ -428,12 +431,14 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None params.pop("eval_metric", None) if early_stopping_rounds is not None: + if len(metrics) > 1: - raise ValueError('Check your params. '\ - 'Early stopping works with single eval metric only.') + msg = ('Check your params. ' + 'Early stopping works with single eval metric only.') + raise ValueError(msg) if verbose_eval: - sys.stderr.write("Will train until cv error hasn't decreased in {} rounds.\n".format(\ - early_stopping_rounds)) + msg = "Will train until cv error hasn't decreased in {} rounds.\n" + sys.stderr.write(msg.format(early_stopping_rounds)) maximize_score = False if len(metrics) == 1: @@ -466,10 +471,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None best_score = score best_score_i = i elif i - best_score_i >= early_stopping_rounds: - results = results[:best_score_i+1] + results = results[:best_score_i + 1] if verbose_eval: - sys.stderr.write("Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n". - format(best_score_i, results[-1][0], results[-1][1])) + msg = "Stopping. Best iteration:\n[{}] cv-mean:{}\tcv-std:{}\n" + sys.stderr.write(msg.format(best_score_i, results[-1][0], results[-1][1])) break if as_pandas: try: diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index ed18f1aad9ef..6a342da40b47 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -8,6 +8,7 @@ class TestBasic(unittest.TestCase): + def test_basic(self): dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') @@ -37,7 +38,7 @@ def test_basic(self): def test_multiclass(self): dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') - param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class' : 2} + param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class': 2} # specify validations set to watch performance watchlist = [(dtest, 'eval'), (dtrain, 'train')] num_round = 2 @@ -60,7 +61,6 @@ def test_multiclass(self): # assert they are the same assert np.sum(np.abs(preds2 - preds)) == 0 - def test_dmatrix_init(self): data = np.random.randn(5, 5) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index ffcb8805a110..8895692dad36 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -8,82 +8,94 @@ rng = np.random.RandomState(1994) + class TestModels(unittest.TestCase): - def test_glm(self): - param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 } - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 4 - bst = xgb.train(param, dtrain, num_round, watchlist) - assert isinstance(bst, xgb.core.Booster) - preds = bst.predict(dtest) - labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 - - def test_eta_decay(self): - param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 2 - # learning_rates as a list - bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.4, 0.3]) - assert isinstance(bst, xgb.core.Booster) - - # learning_rates as a customized decay function - def eta_decay(ithround, num_boost_round): - return num_boost_round / (ithround + 1) - bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay) - assert isinstance(bst, xgb.core.Booster) - - - def test_custom_objective(self): - param = {'max_depth':2, 'eta':1, 'silent':1 } - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 2 - def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0-preds) - return grad, hess - def evalerror(preds, dtrain): - labels = dtrain.get_label() - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) - - # test custom_objective in training - bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) - assert isinstance(bst, xgb.core.Booster) - preds = bst.predict(dtest) - labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 - - # test custom_objective in cross-validation - xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0, - obj = logregobj, feval=evalerror) - - # test maximize parameter - def neg_evalerror(preds, dtrain): - labels = dtrain.get_label() - return 'error', float(sum(labels == (preds > 0.0))) / len(labels) - bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True) - preds2 = bst2.predict(dtest) - err2 = sum(1 for i in range(len(preds2)) if int(preds2[i]>0.5)!=labels[i]) / float(len(preds2)) - assert err == err2 - - def test_fpreproc(self): - param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'} - num_round = 2 - def fpreproc(dtrain, dtest, param): - label = dtrain.get_label() - ratio = float(np.sum(label == 0)) / np.sum(label==1) - param['scale_pos_weight'] = ratio - return (dtrain, dtest, param) - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'auc'}, seed = 0, fpreproc = fpreproc) - - def test_show_stdv(self): - param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'} - num_round = 2 - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'error'}, seed = 0, show_stdv = False) + def test_glm(self): + param = {'silent': 1, 'objective': 'binary:logistic', + 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1} + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + num_round = 4 + bst = xgb.train(param, dtrain, num_round, watchlist) + assert isinstance(bst, xgb.core.Booster) + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) + if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + assert err < 0.1 + + def test_eta_decay(self): + param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + num_round = 2 + # learning_rates as a list + bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=[0.4, 0.3]) + assert isinstance(bst, xgb.core.Booster) + + # learning_rates as a customized decay function + def eta_decay(ithround, num_boost_round): + return num_boost_round / (ithround + 1) + + bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay) + assert isinstance(bst, xgb.core.Booster) + + def test_custom_objective(self): + param = {'max_depth': 2, 'eta': 1, 'silent': 1} + watchlist = [(dtest, 'eval'), (dtrain, 'train')] + num_round = 2 + + def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + def evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + + # test custom_objective in training + bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) + assert isinstance(bst, xgb.core.Booster) + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) + if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + assert err < 0.1 + + # test custom_objective in cross-validation + xgb.cv(param, dtrain, num_round, nfold=5, seed=0, + obj=logregobj, feval=evalerror) + + # test maximize parameter + def neg_evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels == (preds > 0.0))) / len(labels) + + bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True) + preds2 = bst2.predict(dtest) + err2 = sum(1 for i in range(len(preds2)) + if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2)) + assert err == err2 + + def test_fpreproc(self): + param = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic'} + num_round = 2 + + def fpreproc(dtrain, dtest, param): + label = dtrain.get_label() + ratio = float(np.sum(label == 0)) / np.sum(label == 1) + param['scale_pos_weight'] = ratio + return (dtrain, dtest, param) + + xgb.cv(param, dtrain, num_round, nfold=5, + metrics={'auc'}, seed=0, fpreproc=fpreproc) + + def test_show_stdv(self): + param = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic'} + num_round = 2 + xgb.cv(param, dtrain, num_round, nfold=5, + metrics={'error'}, seed=0, show_stdv=False) diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index 6d1895fb1d63..994ae3dde7d1 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -1,7 +1,7 @@ import xgboost as xgb import numpy as np from sklearn.datasets import load_digits -from sklearn.cross_validation import KFold, train_test_split +from sklearn.cross_validation import train_test_split from sklearn.metrics import mean_squared_error import unittest @@ -40,7 +40,6 @@ def test_cv_early_stopping(self): dm = xgb.DMatrix(X, label=y) params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - import pandas as pd cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10) assert cv.shape[0] == 10 cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5) diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py index 190851dae5e9..d7691655798f 100644 --- a/tests/python/test_eval_metrics.py +++ b/tests/python/test_eval_metrics.py @@ -1,9 +1,8 @@ import xgboost as xgb import numpy as np -from sklearn.cross_validation import KFold, train_test_split +from sklearn.cross_validation import train_test_split from sklearn.metrics import mean_squared_error -from sklearn.grid_search import GridSearchCV -from sklearn.datasets import load_iris, load_digits, load_boston +from sklearn.datasets import load_digits import unittest rng = np.random.RandomState(1337) diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py index 7f6123bce6e4..20b62d2b3baf 100644 --- a/tests/python/test_plotting.py +++ b/tests/python/test_plotting.py @@ -12,6 +12,7 @@ dpath = 'demo/data/' rng = np.random.RandomState(1994) + class TestPlotting(unittest.TestCase): def test_plotting(self): bst2 = xgb.Booster(model_file='xgb.model') diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py index ac6deca264f1..21da9df1da55 100644 --- a/tests/python/test_training_continuation.py +++ b/tests/python/test_training_continuation.py @@ -1,10 +1,7 @@ import xgboost as xgb import numpy as np -from sklearn.preprocessing import MultiLabelBinarizer -from sklearn.cross_validation import KFold, train_test_split from sklearn.metrics import mean_squared_error -from sklearn.grid_search import GridSearchCV -from sklearn.datasets import load_iris, load_digits, load_boston +from sklearn.datasets import load_digits import unittest rng = np.random.RandomState(1337) @@ -57,10 +54,14 @@ def test_training_continuation(self): ntrees_02b = len(gbdt_02b.get_dump()) assert ntrees_02a == 10 assert ntrees_02b == 10 - assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \ - mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class)) - assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \ - mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class)) + + res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) + res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class)) + assert res1 == res2 + + res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) + res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class)) + assert res1 == res2 gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3) gbdt_03.save_model('xgb_tc.model') @@ -71,22 +72,30 @@ def test_training_continuation(self): ntrees_03b = len(gbdt_03b.get_dump()) assert ntrees_03a == 10 assert ntrees_03b == 10 - assert mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) == \ - mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class)) + + res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) + res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class)) + assert res1 == res2 gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3) assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree - assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \ - mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + + res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) + res2 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + assert res1 == res2 gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04) assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree - assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \ - mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + + res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) + res2 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + assert res1 == res2 gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7) assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05) assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree - assert np.any(gbdt_05.predict(dtrain_5class) != - gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit)) == False + + res1 = gbdt_05.predict(dtrain_5class) + res2 = gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit) + np.testing.assert_almost_equal(res1, res2) diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index f5ceb6fc2104..f23e2b94604f 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -111,43 +111,55 @@ def test_cv_as_pandas(self): u'train-error-mean', u'train-error-std']) assert cv.columns.equals(exp) - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc'} + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic', 'eval_metric': 'auc'} cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) assert 'eval_metric' in params assert 'auc' in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic', 'eval_metric': ['auc']} cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) assert 'eval_metric' in params assert 'auc' in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, early_stopping_rounds=1) + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic', 'eval_metric': ['auc']} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, early_stopping_rounds=1) assert 'eval_metric' in params assert 'auc' in cv.columns[0] assert cv.shape[0] < 10 - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='auc') + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic'} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, metrics='auc') assert 'auc' in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['auc']) + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic'} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, metrics=['auc']) assert 'auc' in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics='error') + params = {'max_depth': 2, 'eta': 1, 'silent': 1, + 'objective': 'binary:logistic', 'eval_metric': ['auc']} + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, metrics='error') assert 'eval_metric' in params assert 'auc' not in cv.columns[0] assert 'error' in cv.columns[0] - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, metrics=['error']) assert 'eval_metric' in params assert 'auc' not in cv.columns[0] assert 'error' in cv.columns[0] params = list(params.items()) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=['error']) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + as_pandas=True, metrics=['error']) assert isinstance(params, list) assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] \ No newline at end of file + assert 'error' in cv.columns[0] diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 161fe30a607e..716feee2b6fa 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1,6 +1,5 @@ import xgboost as xgb import numpy as np -from sklearn.cross_validation import KFold from sklearn.metrics import mean_squared_error from sklearn.grid_search import GridSearchCV from sklearn.datasets import load_iris, load_digits, load_boston @@ -8,33 +7,46 @@ rng = np.random.RandomState(1994) + def test_binary_classification(): digits = load_digits(2) y = digits['target'] X = digits['data'] kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 + err = sum(1 for i in range(len(preds)) + if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + assert err < 0.1 + def test_multiclass_classification(): + + def check_pred(preds, labels): + err = sum(1 for i in range(len(preds)) + if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + assert err < 0.4 + iris = load_iris() y = iris['target'] X = iris['data'] kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) # test other params in XGBClassifier().fit preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3) labels = y[test_index] - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.4 + + check_pred(preds, labels) + check_pred(preds2, labels) + check_pred(preds3, labels) + check_pred(preds4, labels) + def test_boston_housing_regression(): boston = load_boston() @@ -42,27 +54,33 @@ def test_boston_housing_regression(): X = boston['data'] kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: - xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) + xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) + preds = xgb_model.predict(X[test_index]) # test other params in XGBRegressor().fit preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3) labels = y[test_index] - assert mean_squared_error(preds, labels) < 25 + + assert mean_squared_error(preds, labels) < 25 + assert mean_squared_error(preds2, labels) < 350 + assert mean_squared_error(preds3, labels) < 25 + assert mean_squared_error(preds4, labels) < 350 + def test_parameter_tuning(): boston = load_boston() y = boston['target'] X = boston['data'] xgb_model = xgb.XGBRegressor() - clf = GridSearchCV(xgb_model, - {'max_depth': [2,4,6], - 'n_estimators': [50,100,200]}, verbose=1) - clf.fit(X,y) + clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], + 'n_estimators': [50, 100, 200]}, verbose=1) + clf.fit(X, y) assert clf.best_score_ < 0.7 assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4} + def test_regression_with_custom_objective(): def objective_ls(y_true, y_pred): grad = (y_pred - y_true) @@ -86,20 +104,17 @@ class XGBCustomObjectiveException(Exception): pass def dummy_objective(y_true, y_pred): - raise XGBCustomObjectiveException() + raise XGBCustomObjectiveException() xgb_model = xgb.XGBRegressor(objective=dummy_objective) - np.testing.assert_raises( - XGBCustomObjectiveException, - xgb_model.fit, - X, y - ) + np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y) + def test_classification_with_custom_objective(): def logregobj(y_true, y_pred): y_pred = 1.0 / (1.0 + np.exp(-y_pred)) grad = y_pred - y_true - hess = y_pred * (1.0-y_pred) + hess = y_pred * (1.0 - y_pred) return grad, hess digits = load_digits(2) @@ -107,22 +122,20 @@ def logregobj(y_true, y_pred): X = digits['data'] kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) for train_index, test_index in kf: - xgb_model = xgb.XGBClassifier(objective=logregobj).fit( - X[train_index],y[train_index] - ) + xgb_model = xgb.XGBClassifier(objective=logregobj) + xgb_model.fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] err = sum(1 for i in range(len(preds)) - if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 - + if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + assert err < 0.1 # Test that the custom objective function is actually used class XGBCustomObjectiveException(Exception): pass def dummy_objective(y_true, y_preds): - raise XGBCustomObjectiveException() + raise XGBCustomObjectiveException() xgb_model = xgb.XGBClassifier(objective=dummy_objective) np.testing.assert_raises( @@ -131,6 +144,7 @@ def dummy_objective(y_true, y_preds): X, y ) + def test_sklearn_api(): iris = load_iris() tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120) @@ -143,6 +157,7 @@ def test_sklearn_api(): err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l) assert err < 0.2 + def test_sklearn_plotting(): iris = load_iris() @@ -168,12 +183,13 @@ def test_sklearn_plotting(): ax = xgb.plot_tree(classifier, num_trees=0) assert isinstance(ax, Axes) + def test_sklearn_nfolds_cv(): digits = load_digits(3) X = digits['data'] y = digits['target'] dm = xgb.DMatrix(X, label=y) - + params = { 'max_depth': 2, 'eta': 1, @@ -187,9 +203,8 @@ def test_sklearn_nfolds_cv(): nfolds = 5 skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed) - import pandas as pd cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed) cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed) cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed) assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0] - assert cv2.iloc[-1,0] == cv3.iloc[-1,0] \ No newline at end of file + assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0] diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index 482dcafdc96b..753523737471 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -52,6 +52,9 @@ if [ ${TASK} == "python_lightweight_test" ]; then conda install numpy scipy nose python -m pip install graphviz python -m nose tests/python/test_basic*.py || exit -1 + python -m pip install flake8 + flake8 --ignore E501 python-package || exit -1 + flake8 --ignore E501 tests/python || exit -1 exit 0 fi