Source code for lightgbm.engine

# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import

import collections
from operator import attrgetter

import numpy as np

from . import callback
from .basic import Booster, Dataset, LightGBMError, _InnerPredictor
from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, LGBMGroupKFold, integer_types,
                     range_, string_type)


def train(params, train_set, num_boost_round=100,
          valid_sets=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          feature_name='auto', categorical_feature='auto',
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """
    Train with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training.
    train_set : Dataset
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    valid_sets : list of Datasets
        List of data to be evaluated during training.
    valid_names : list of string
        Names of valid_sets.
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better)
        or a list of such tuples.
    init_model : file name of lightgbm model or 'Booster' instance
        Model used for continued training.
    feature_name : list of str, or 'auto'
        Feature names.
        If 'auto' and data is pandas DataFrame, use data column names.
    categorical_feature : list of str or int, or 'auto'
        Categorical features; type int represents index,
        type str represents feature names (need to specify feature_name as well).
        If 'auto' and data is pandas DataFrame, use pandas categorical columns.
    early_stopping_rounds : int
        Activates early stopping.
        Requires at least one validation set and one metric.
        If there is more than one, all of them will be checked.
        Returns the model with (best_iter + early_stopping_rounds).
        If early stopping occurs, the model will have a 'best_iteration' field.
    evals_result : dict or None
        Dictionary used to store all evaluation results of all the items in valid_sets.
        Example: with valid_sets = [valid_set, train_set],
        valid_names = ['eval', 'train'] and params containing {'metric': 'logloss'},
        it stores {'train': {'logloss': ['0.48253', '0.35953', ...]},
                   'eval': {'logloss': ['0.480385', '0.357756', ...]}}.
        Passing None means this is not used.
    verbose_eval : bool or int
        Requires at least one validation set.
        If True, the eval metric on the valid set is printed at each boosting stage.
        If int, the eval metric on the valid set is printed at every `verbose_eval`
        boosting stages. The last boosting stage, or the boosting stage found by
        using `early_stopping_rounds`, is also printed.
        Example: with verbose_eval=4 and at least one validation set,
        an evaluation metric is printed every 4 (instead of 1) boosting stages.
    learning_rates : list or function
        List of learning rates for each boosting round, or a customized function
        that calculates the learning rate in terms of the current round number
        (e.g. yields learning rate decay).
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round)
    callbacks : list of callback functions
        List of callback functions that are applied at each iteration.
        See Callbacks in Python-API.md for more information.
    Returns
    -------
    booster : a trained booster model
    """
    """create predictor first"""
    if isinstance(init_model, string_type):
        predictor = _InnerPredictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model._to_predictor()
    else:
        predictor = None
    init_iteration = predictor.num_total_iteration if predictor is not None else 0
    """check dataset"""
    if not isinstance(train_set, Dataset):
        raise TypeError("Training only accepts Dataset object")

    train_set._update_params(params)
    train_set._set_predictor(predictor)
    train_set.set_feature_name(feature_name)
    train_set.set_categorical_feature(categorical_feature)

    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, string_type):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            """reduce cost for prediction on training data"""
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            if not isinstance(valid_data, Dataset):
                raise TypeError("Training only accepts Dataset object")
            valid_data.set_reference(train_set)
            reduced_valid_sets.append(valid_data)
            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_' + str(i))
        for valid_data in valid_sets:
            valid_data._update_params(params)

    """process callbacks"""
    if callbacks is None:
        callbacks = set()
    else:
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)

    # Most of the legacy advanced options become callbacks
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation())
    elif isinstance(verbose_eval, integer_types):
        callbacks.add(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.add(callback.early_stopping(early_stopping_rounds, verbose=bool(verbose_eval)))

    if learning_rates is not None:
        callbacks.add(callback.reset_parameter(learning_rate=learning_rates))

    if evals_result is not None:
        callbacks.add(callback.record_evaluation(evals_result))

    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))

    """construct booster"""
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)
    booster.best_iteration = 0

    """start training"""
    for i in range_(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    params=params,
                                    iteration=i,
                                    begin_iteration=init_iteration,
                                    end_iteration=init_iteration + num_boost_round,
                                    evaluation_result_list=None))

        booster.update(fobj=fobj)

        evaluation_result_list = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        params=params,
                                        iteration=i,
                                        begin_iteration=init_iteration,
                                        end_iteration=init_iteration + num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException as earlyStopException:
            booster.best_iteration = earlyStopException.best_iteration + 1
            evaluation_result_list = earlyStopException.best_score
            break
    booster.best_score = collections.defaultdict(dict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    return booster
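
# A minimal usage sketch of ``train`` (illustrative only, not part of the
# library; the data shapes, parameter values, and variable names below are
# assumptions for the example):
#
#   import numpy as np
#   import lightgbm as lgb
#
#   X, y = np.random.rand(500, 10), np.random.randint(2, size=500)
#   dtrain = lgb.Dataset(X[:400], label=y[:400])
#   dvalid = lgb.Dataset(X[400:], label=y[400:], reference=dtrain)
#
#   evals_result = {}
#   bst = lgb.train({'objective': 'binary', 'metric': 'binary_logloss'},
#                   dtrain,
#                   num_boost_round=50,
#                   valid_sets=[dvalid],
#                   valid_names=['eval'],
#                   early_stopping_rounds=5,
#                   evals_result=evals_result)
#   # evals_result now maps 'eval' -> {'binary_logloss': [...]}; if early
#   # stopping fired, bst.best_iteration holds the selected iteration.
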
class CVBooster(object):
    """Auxiliary data struct to hold all boosters of CV."""

    def __init__(self):
        self.boosters = []
        self.best_iteration = -1

    def append(self, booster):
        """Add a booster to CVBooster."""
        self.boosters.append(booster)

    def __getattr__(self, name):
        """Redirect method calls of CVBooster."""
        def handlerFunction(*args, **kwargs):
            """Call the method on each booster, and concatenate their results."""
            ret = []
            for booster in self.boosters:
                ret.append(getattr(booster, name)(*args, **kwargs))
            return ret
        return handlerFunction


def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
    """
    Make an n-fold list of Boosters from random indices.
    """
    full_data = full_data.construct()
    num_data = full_data.num_data()
    if folds is not None:
        if not hasattr(folds, '__iter__'):
            raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx)")
    else:
        if 'objective' in params and params['objective'] == 'lambdarank':
            if not SKLEARN_INSTALLED:
                raise LightGBMError('Scikit-learn is required for lambdarank cv.')
            """lambdarank task, split according to groups"""
            group_info = full_data.get_group().astype(int)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError('Scikit-learn is required for stratified cv.')
            skf = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = skf.split(X=np.zeros(num_data), y=full_data.get_label())
        else:
            if shuffle:
                randidx = np.random.RandomState(seed).permutation(num_data)
            else:
                randidx = np.arange(num_data)
            kstep = int(num_data / nfold)
            test_id = [randidx[i: i + kstep] for i in range_(0, num_data, kstep)]
            train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        train_set = full_data.subset(train_idx)
        valid_set = full_data.subset(test_idx)
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        cvbooster = Booster(tparam, train_set)
        cvbooster.add_valid(valid_set, 'valid')
        ret.append(cvbooster)
    return ret


def _agg_cv_result(raw_results):
    """
    Aggregate cross-validation results.
    """
    cvmap = collections.defaultdict(list)
    metric_type = {}
    for one_result in raw_results:
        for one_line in one_result:
            metric_type[one_line[1]] = one_line[3]
            cvmap[one_line[1]].append(one_line[2])
    return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
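
# Illustrative sketch of what ``_agg_cv_result`` computes (hypothetical values;
# each inner tuple is (dataset_name, eval_name, result, is_higher_better)):
#
#   raw = [[('valid', 'l2', 0.25, False)],   # fold 1
#          [('valid', 'l2', 0.35, False)]]   # fold 2
#   _agg_cv_result(raw)
#   # -> [('cv_agg', 'l2', 0.3, False, 0.05)]  (mean and std across folds)
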
def cv(params, train_set, num_boost_round=10, folds=None, nfold=5,
       stratified=False, shuffle=True,
       metrics=None, fobj=None, feval=None, init_model=None,
       feature_name='auto', categorical_feature='auto',
       early_stopping_rounds=None, fpreproc=None,
       verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
    """
    Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    train_set : Dataset
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    folds : a generator or iterator of (train_idx, test_idx) tuples
        The train and test indices for each fold.
        This argument takes priority over the other data split arguments.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Whether to perform stratified sampling.
    shuffle : bool
        Whether to shuffle before splitting data.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
        If `metrics` is not None, the metric in `params` will be overridden.
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    init_model : file name of lightgbm model or 'Booster' instance
        Model used for continued training.
    feature_name : list of str, or 'auto'
        Feature names.
        If 'auto' and data is pandas DataFrame, use data column names.
    categorical_feature : list of str or int, or 'auto'
        Categorical features; type int represents index,
        type str represents feature names (need to specify feature_name as well).
        If 'auto' and data is pandas DataFrame, use pandas categorical columns.
    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least every
        <early_stopping_rounds> round(s) to continue.
        The last entry in the evaluation history is the one from the best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param)
        and returns transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress.
        If None, progress will be displayed when np.ndarray is returned.
        If True, progress will be displayed at every boosting stage.
        If an integer is given, progress will be displayed at every given
        `verbose_eval` boosting stages.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contain std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at each iteration.
        See Callbacks in Python-API.md for more information.
    Returns
    -------
    eval_hist : dict
        Evaluation history; maps '<metric>-mean' and '<metric>-stdv'
        to lists with one entry per boosting round.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError("Training only accepts Dataset object")

    if isinstance(init_model, string_type):
        predictor = _InnerPredictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model._to_predictor()
    else:
        predictor = None

    train_set._update_params(params)
    train_set._set_predictor(predictor)
    train_set.set_feature_name(feature_name)
    train_set.set_categorical_feature(categorical_feature)

    if metrics is not None:
        params['metric'] = metrics

    results = collections.defaultdict(list)
    cvfolds = _make_n_folds(train_set, folds=folds, nfold=nfold,
                            params=params, seed=seed, fpreproc=fpreproc,
                            stratified=stratified, shuffle=shuffle)

    # setup callbacks
    if callbacks is None:
        callbacks = set()
    else:
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)
    if early_stopping_rounds is not None:
        callbacks.add(callback.early_stopping(early_stopping_rounds, verbose=False))
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, integer_types):
        callbacks.add(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))

    for i in range_(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=cvfolds,
                                    params=params,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        cvfolds.update(fobj=fobj)
        res = _agg_cv_result(cvfolds.eval_valid(feval))
        for _, key, mean, _, std in res:
            results[key + '-mean'].append(mean)
            results[key + '-stdv'].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=cvfolds,
                                        params=params,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as earlyStopException:
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
            for k in results:
                results[k] = results[k][:cvfolds.best_iteration]
            break
    return dict(results)
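
# A minimal usage sketch of ``cv`` (illustrative assumptions only):
#
#   import numpy as np
#   import lightgbm as lgb
#
#   X, y = np.random.rand(500, 10), np.random.randint(2, size=500)
#   dtrain = lgb.Dataset(X, label=y)
#   hist = lgb.cv({'objective': 'binary'}, dtrain,
#                 num_boost_round=20, nfold=5, stratified=True,
#                 metrics='binary_logloss', seed=0)
#   # hist is a dict with keys 'binary_logloss-mean' and 'binary_logloss-stdv',
#   # each a list with one entry per boosting round actually performed.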