# coding: utf-8
# pylint: disable = invalid-name, C0111, C0301
# pylint: disable = R0912, R0913, R0914, W0105, W0201, W0212
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
import ctypes
import os
import warnings
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json,
json_default_with_numpy, numeric_types, range_,
string_type)
from .libpath import find_lib_path
def _load_lib():
"""Load LightGBM Library."""
lib_path = find_lib_path()
if len(lib_path) == 0:
return None
lib = ctypes.cdll.LoadLibrary(lib_path[0])
lib.LGBM_GetLastError.restype = ctypes.c_char_p
return lib
_LIB = _load_lib()
class LightGBMError(Exception):
"""Error throwed by LightGBM"""
pass
def _safe_call(ret):
"""Check the return value of C API call
Parameters
----------
ret : int
return value from API calls
"""
if ret != 0:
raise LightGBMError(_LIB.LGBM_GetLastError())
def is_numeric(obj):
"""Check is a number or not, include numpy number etc."""
try:
float(obj)
return True
except (TypeError, ValueError):
# TypeError: obj is not a string or a number
# ValueError: invalid literal
return False
def is_numpy_1d_array(data):
"""Check is 1d numpy array"""
return isinstance(data, np.ndarray) and len(data.shape) == 1
def is_1d_list(data):
"""Check is 1d list"""
return isinstance(data, list) and \
(not data or isinstance(data[0], numeric_types))
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array"""
if is_numpy_1d_array(data):
if data.dtype == dtype:
return data
else:
return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, Series):
return data.values.astype(dtype)
else:
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
return np.fromiter(cptr, dtype=np.float32, count=length)
else:
raise RuntimeError('Expected float pointer')
def cfloat64_array_to_numpy(cptr, length):
"""Convert a ctypes double pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
return np.fromiter(cptr, dtype=np.float64, count=length)
else:
raise RuntimeError('Expected double pointer')
def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
return np.fromiter(cptr, dtype=np.int32, count=length)
else:
raise RuntimeError('Expected int pointer')
def c_str(string):
"""Convert a python string to cstring."""
return ctypes.c_char_p(string.encode('utf-8'))
def c_array(ctype, values):
"""Convert a python array to c array."""
return (ctype * len(values))(*values)
def param_dict_to_str(data):
if data is None or not data:
return ""
pairs = []
for key, val in data.items():
if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
pairs.append(str(key) + '=' + ','.join(map(str, val)))
elif isinstance(val, string_type) or isinstance(val, numeric_types) or is_numeric(val):
pairs.append(str(key) + '=' + str(val))
else:
raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
class _temp_file(object):
def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if os.path.isfile(self.name):
os.remove(self.name)
def readlines(self):
with open(self.name, "r+") as f:
ret = f.readlines()
return ret
def writelines(self, lines):
with open(self.name, "w+") as f:
f.writelines(lines)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 = 3
"""Matric is row major in python"""
C_API_IS_ROW_MAJOR = 1
"""marco definition of prediction type in c_api of LightGBM"""
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2
"""data type of data field"""
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"weight": C_API_DTYPE_FLOAT32,
"init_score": C_API_DTYPE_FLOAT64,
"group": C_API_DTYPE_INT32}
def c_float_array(data):
"""get pointer of float numpy array / list"""
if is_1d_list(data):
data = np.array(data, copy=False)
if is_numpy_1d_array(data):
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.float64:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64
else:
raise TypeError("Expected np.float32 or np.float64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
def c_int_array(data):
"""get pointer of int numpy array / list"""
if is_1d_list(data):
data = np.array(data, copy=False)
if is_numpy_1d_array(data):
if data.dtype == np.int32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
elif data.dtype == np.int64:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64
else:
raise TypeError("Expected np.int32 or np.int64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame):
if feature_name == 'auto' or feature_name is None:
if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]):
msg = """Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
warnings.filterwarnings('once')
warnings.warn(msg, stacklevel=5)
data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
return label
def _save_pandas_categorical(file_name, pandas_categorical):
with open(file_name, 'a') as f:
f.write('\npandas_categorical:' + json.dumps(pandas_categorical, default=json_default_with_numpy))
def _load_pandas_categorical(file_name):
with open(file_name, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
return json.loads(last_line[len('pandas_categorical:'):])
return None
class _InnerPredictor(object):
"""
A _InnerPredictor of LightGBM.
Only used for prediction, usually used for continued-train
Note: Can convert from Booster, but cannot convert to Booster
"""
def __init__(self, model_file=None, booster_handle=None):
"""Initialize the _InnerPredictor. Not expose to user
Parameters
----------
model_file : string
Path to the model file.
booster_handle : Handle of Booster
use handle to init
"""
self.handle = ctypes.c_void_p()
self.__is_manage_handle = True
if model_file is not None:
"""Prediction task"""
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_num_class)))
self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif booster_handle is not None:
self.__is_manage_handle = False
self.handle = booster_handle
out_num_class = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_num_class)))
self.num_class = out_num_class.value
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle,
ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else:
raise TypeError('Need Model file or Booster handle to create a predictor')
def __del__(self):
if self.__is_manage_handle:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def __getstate__(self):
this = self.__dict__.copy()
this.pop('handle', None)
return this
def predict(self, data, num_iteration=-1,
raw_score=False, pred_leaf=False, data_has_header=False,
is_reshape=True):
"""
Predict logic
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When data type is string, it represents the path of txt file
num_iteration : int
Used iteration for prediction
raw_score : bool
True for predict raw score
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data, True if txt data has header
is_reshape : bool
Reshape to (nrow, ncol) if true
Returns
-------
Prediction result
"""
if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE
if pred_leaf:
predict_type = C_API_PREDICT_LEAF_INDEX
int_data_has_header = 1 if data_has_header else 0
if num_iteration > self.num_total_iteration:
num_iteration = self.num_total_iteration
if isinstance(data, string_type):
with _temp_file() as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
ctypes.c_int(int_data_has_header),
ctypes.c_int(predict_type),
ctypes.c_int(num_iteration),
c_str(f.name)))
lines = f.readlines()
nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float64, copy=False)
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration,
predict_type)
elif isinstance(data, scipy.sparse.csc_matrix):
preds, nrow = self.__pred_for_csc(data, num_iteration,
predict_type)
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else:
try:
csr = scipy.sparse.csr_matrix(data)
preds, nrow = self.__pred_for_csr(csr, num_iteration,
predict_type)
except:
raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
if pred_leaf:
preds = preds.astype(np.int32)
if is_reshape and preds.size != nrow:
if preds.size % nrow == 0:
preds = preds.reshape(nrow, -1)
else:
raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
% (preds.size, nrow))
return preds
def __get_num_preds(self, num_iteration, nrow, predict_type):
"""
Get size of prediction result
"""
n_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCalcNumPredict(
self.handle,
ctypes.c_int(nrow),
ctypes.c_int(predict_type),
ctypes.c_int(num_iteration),
ctypes.byref(n_preds)))
return n_preds.value
def __pred_for_np2d(self, mat, num_iteration, predict_type):
"""
Predict for a 2-D numpy matrix.
"""
if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional')
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
predict_type)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterPredictForMat(
self.handle,
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int(mat.shape[0]),
ctypes.c_int(mat.shape[1]),
ctypes.c_int(C_API_IS_ROW_MAJOR),
ctypes.c_int(predict_type),
ctypes.c_int(num_iteration),
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
"""
Predict for a csr data
"""
nrow = len(csr.indptr) - 1
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(_LIB.LGBM_BoosterPredictForCSR(
self.handle,
ptr_indptr,
ctypes.c_int32(type_ptr_indptr),
csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int64(len(csr.indptr)),
ctypes.c_int64(len(csr.data)),
ctypes.c_int64(csr.shape[1]),
ctypes.c_int(predict_type),
ctypes.c_int(num_iteration),
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
def __pred_for_csc(self, csc, num_iteration, predict_type):
"""
Predict for a csc data
"""
nrow = csc.shape[0]
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
_safe_call(_LIB.LGBM_BoosterPredictForCSC(
self.handle,
ptr_indptr,
ctypes.c_int32(type_ptr_indptr),
csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int64(len(csc.indptr)),
ctypes.c_int64(len(csc.data)),
ctypes.c_int64(csc.shape[0]),
ctypes.c_int(predict_type),
ctypes.c_int(num_iteration),
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
[docs]class Dataset(object):
"""Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False,
feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True):
"""
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
Max number of discrete bin for features
reference : Other Dataset, optional
If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
True if need to free raw data after construct inner dataset
"""
self.handle = None
self.data = data
self.label = label
self.max_bin = max_bin
self.reference = reference
self.weight = weight
self.group = group
self.silent = silent
self.feature_name = feature_name
self.categorical_feature = categorical_feature
self.params = params
self.free_raw_data = free_raw_data
self.used_indices = None
self._predictor = None
self.pandas_categorical = None
def __del__(self):
self._free_handle()
def _free_handle(self):
if self.handle is not None:
_safe_call(_LIB.LGBM_DatasetFree(self.handle))
self.handle = None
def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None,
silent=False, feature_name='auto',
categorical_feature='auto', params=None):
if data is None:
self.handle = None
return
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
params = {} if params is None else params
self.max_bin = max_bin
self.predictor = predictor
params["max_bin"] = max_bin
if silent:
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
"""get categorical features"""
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
ref_dataset = None
if isinstance(reference, Dataset):
ref_dataset = reference.construct().handle
elif reference is not None:
raise TypeError('Reference dataset should be None or dataset instance')
"""start construct data"""
if isinstance(data, string_type):
"""check data has header or not"""
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix):
self.__init_from_csr(data, params_str, ref_dataset)
elif isinstance(data, scipy.sparse.csc_matrix):
self.__init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset)
else:
try:
csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset)
except:
raise TypeError('Cannot initialize Dataset from {}'.format(type(data).__name__))
if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("Label should not be None")
if weight is not None:
self.set_weight(weight)
if group is not None:
self.set_group(group)
# load init score
if isinstance(self.predictor, _InnerPredictor):
init_score = self.predictor.predict(data,
raw_score=True,
data_has_header=self.data_has_header,
is_reshape=False)
if self.predictor.num_class > 1:
# need re group init score
new_init_score = np.zeros(init_score.size, dtype=np.float32)
num_data = self.num_data()
for i in range_(num_data):
for j in range_(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score
self.set_init_score(init_score)
elif self.predictor is not None:
raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__))
# set feature names
self.set_feature_name(feature_name)
def __init_from_np2d(self, mat, params_str, ref_dataset):
"""
Initialize data from a 2-D numpy matrix.
"""
if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional')
self.handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
_safe_call(_LIB.LGBM_DatasetCreateFromMat(
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int(mat.shape[0]),
ctypes.c_int(mat.shape[1]),
ctypes.c_int(C_API_IS_ROW_MAJOR),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def __init_from_csr(self, csr, params_str, ref_dataset):
"""
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(_LIB.LGBM_DatasetCreateFromCSR(
ptr_indptr,
ctypes.c_int(type_ptr_indptr),
csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int64(len(csr.indptr)),
ctypes.c_int64(len(csr.data)),
ctypes.c_int64(csr.shape[1]),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def __init_from_csc(self, csc, params_str, ref_dataset):
"""
Initialize data from a csc matrix.
"""
if len(csc.indices) != len(csc.data):
raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
_safe_call(_LIB.LGBM_DatasetCreateFromCSC(
ptr_indptr,
ctypes.c_int(type_ptr_indptr),
csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
ctypes.c_int(type_ptr_data),
ctypes.c_int64(len(csc.indptr)),
ctypes.c_int64(len(csc.data)),
ctypes.c_int64(csc.shape[0]),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
[docs] def construct(self):
"""Lazy init"""
if self.handle is None:
if self.reference is not None:
if self.used_indices is None:
"""create valid"""
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin, reference=self.reference,
weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name, params=self.params)
else:
"""construct subset"""
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
self.handle = ctypes.c_void_p()
params_str = param_dict_to_str(self.params)
_safe_call(_LIB.LGBM_DatasetGetSubset(
self.reference.construct().handle,
used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ctypes.c_int(used_indices.shape[0]),
c_str(params_str),
ctypes.byref(self.handle)))
if self.get_label() is None:
raise ValueError("Label should not be None.")
else:
"""create train"""
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data:
self.data = None
return self
[docs] def create_valid(self, data, label=None, weight=None, group=None,
silent=False, params=None):
"""
Create validation data align with current dataset
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
params: dict, optional
Other parameters
"""
ret = Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret
[docs] def subset(self, used_indices, params=None):
"""
Get subset of current dataset
Parameters
----------
used_indices : list of int
Used indices of this subset
params : dict
Other parameters
"""
ret = Dataset(None, reference=self, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=params)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices
return ret
[docs] def save_binary(self, filename):
"""
Save Dataset to binary file
Parameters
----------
filename : string
Name of the output file.
"""
_safe_call(_LIB.LGBM_DatasetSaveBinary(
self.construct().handle,
c_str(filename)))
def _update_params(self, params):
if not self.params:
self.params = params
else:
self.params.update(params)
[docs] def set_field(self, field_name, data):
"""Set property into the Dataset.
Parameters
----------
field_name: str
The field name of the information
data: numpy array or list or None
The array ofdata to be set
"""
if self.handle is None:
raise Exception("Cannot set %s before construct dataset" % field_name)
if data is None:
"""set to None"""
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
None,
ctypes.c_int(0),
ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
return
dtype = np.float32
if field_name == 'group':
dtype = np.int32
elif field_name == 'init_score':
dtype = np.float64
data = list_to_1d_numpy(data, dtype, name=field_name)
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.float64:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64
elif data.dtype == np.int32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
else:
raise TypeError("Excepted np.float32/64 or np.int32, meet type({})".format(data.dtype))
if type_data != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("Input type error for set_field")
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
ptr_data,
ctypes.c_int(len(data)),
ctypes.c_int(type_data)))
[docs] def get_field(self, field_name):
"""Get property from the Dataset.
Parameters
----------
field_name: str
The field name of the information
Returns
-------
info : array
A numpy array of information of the data
"""
if self.handle is None:
raise Exception("Cannot get %s before construct dataset" % field_name)
tmp_out_len = ctypes.c_int()
out_type = ctypes.c_int()
ret = ctypes.POINTER(ctypes.c_void_p)()
_safe_call(_LIB.LGBM_DatasetGetField(
self.handle,
c_str(field_name),
ctypes.byref(tmp_out_len),
ctypes.byref(ret),
ctypes.byref(out_type)))
if out_type.value != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("Return type error for get_field")
if tmp_out_len.value == 0:
return None
if out_type.value == C_API_DTYPE_INT32:
return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT64:
return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
else:
raise TypeError("Unknown type")
[docs] def set_categorical_feature(self, categorical_feature):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if self.categorical_feature == categorical_feature:
return
if self.data is not None:
self.categorical_feature = categorical_feature
self._free_handle()
else:
raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def _set_predictor(self, predictor):
"""
Set predictor for continued training, not recommand for user to call this function.
Please set init_model in engine.train or engine.cv
"""
if predictor is self._predictor:
return
if self.data is not None:
self._predictor = predictor
self._free_handle()
else:
raise LightGBMError("Cannot set predictor after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
[docs] def set_reference(self, reference):
"""
Set reference dataset
Parameters
----------
reference : Dataset
Will use reference as template to consturct current dataset
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
self._set_predictor(reference._predictor)
if self.reference is reference:
return
if self.data is not None:
self.reference = reference
self._free_handle()
else:
raise LightGBMError("Cannot set reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
[docs] def set_feature_name(self, feature_name):
"""
Set feature name
Parameters
----------
feature_name : list of str
Feature names
"""
if feature_name != 'auto':
self.feature_name = feature_name
if self.handle is not None and feature_name is not None and feature_name != 'auto':
if len(feature_name) != self.num_feature():
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle,
c_array(ctypes.c_char_p, c_feature_name),
ctypes.c_int(len(feature_name))))
[docs] def set_label(self, label):
"""
Set label of Dataset
Parameters
----------
label: numpy array or list or None
The label information to be set into Dataset
"""
self.label = label
if self.handle is not None:
label = list_to_1d_numpy(label, name='label')
self.set_field('label', label)
[docs] def set_weight(self, weight):
"""
Set weight of each instance.
Parameters
----------
weight : numpy array or list or None
Weight for each data point
"""
self.weight = weight
if self.handle is not None and weight is not None:
weight = list_to_1d_numpy(weight, name='weight')
self.set_field('weight', weight)
[docs] def set_init_score(self, init_score):
"""
Set init score of booster to start from.
Parameters
----------
init_score: numpy array or list or None
Init score for booster
"""
self.init_score = init_score
if self.handle is not None and init_score is not None:
init_score = list_to_1d_numpy(init_score, np.float64, name='init_score')
self.set_field('init_score', init_score)
[docs] def set_group(self, group):
"""
Set group size of Dataset (used for ranking).
Parameters
----------
group : numpy array or list or None
Group size of each group
"""
self.group = group
if self.handle is not None and group is not None:
group = list_to_1d_numpy(group, np.int32, name='group')
self.set_field('group', group)
[docs] def get_label(self):
"""
Get the label of the Dataset.
Returns
-------
label : array
"""
if self.label is None and self.handle is not None:
self.label = self.get_field('label')
return self.label
[docs] def get_weight(self):
"""
Get the weight of the Dataset.
Returns
-------
weight : array
"""
if self.weight is None and self.handle is not None:
self.weight = self.get_field('weight')
return self.weight
[docs] def get_init_score(self):
"""
Get the initial score of the Dataset.
Returns
-------
init_score : array
"""
if self.init_score is None and self.handle is not None:
self.init_score = self.get_field('init_score')
return self.init_score
[docs] def get_group(self):
"""
Get the group of the Dataset.
Returns
-------
init_score : array
"""
if self.group is None and self.handle is not None:
self.group = self.get_field('group')
if self.group is not None:
# group data from LightGBM is boundaries data, need to convert to group size
new_group = []
for i in range_(len(self.group) - 1):
new_group.append(self.group[i + 1] - self.group[i])
self.group = new_group
return self.group
[docs] def num_data(self):
"""
Get the number of rows in the Dataset.
Returns
-------
number of rows : int
"""
if self.handle is not None:
ret = ctypes.c_int()
_safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
ctypes.byref(ret)))
return ret.value
else:
raise LightGBMError("Cannot get num_data before construct dataset")
[docs] def num_feature(self):
"""
Get the number of columns (features) in the Dataset.
Returns
-------
number of columns : int
"""
if self.handle is not None:
ret = ctypes.c_int()
_safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
ctypes.byref(ret)))
return ret.value
else:
raise LightGBMError("Cannot get num_feature before construct dataset")
[docs]class Booster(object):
""""Booster in LightGBM."""
def __init__(self, params=None, train_set=None, model_file=None, silent=False):
"""
Initialize the Booster.
Parameters
----------
params : dict
Parameters for boosters.
train_set : Dataset
Training dataset
model_file : string
Path to the model file.
silent : boolean, optional
Whether print messages during construction
"""
self.handle = ctypes.c_void_p()
self.__need_reload_eval_info = True
self.__train_data_name = "training"
self.__attr = {}
self.best_iteration = -1
self.best_score = {}
params = {} if params is None else params
if silent:
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
if train_set is not None:
"""Training task"""
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params)
"""construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate(
train_set.construct().handle,
c_str(params_str),
ctypes.byref(self.handle)))
"""save reference to data"""
self.train_set = train_set
self.valid_sets = []
self.name_valid_sets = []
self.__num_dataset = 1
self.__init_predictor = train_set._predictor
if self.__init_predictor is not None:
_safe_call(_LIB.LGBM_BoosterMerge(
self.handle,
self.__init_predictor.handle))
out_num_class = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
"""buffer for inner predict"""
self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None:
"""Prediction task"""
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif 'model_str' in params:
self.__load_model_from_string(params['model_str'])
else:
raise TypeError('Need at least one training dataset or model file to create booster instance')
def __del__(self):
if self.handle is not None:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def __copy__(self):
return self.__deepcopy__(None)
def __deepcopy__(self, _):
model_str = self.__save_model_to_string()
booster = Booster({'model_str': model_str})
booster.pandas_categorical = self.pandas_categorical
return booster
def __getstate__(self):
this = self.__dict__.copy()
handle = this['handle']
this.pop('train_set', None)
this.pop('valid_sets', None)
if handle is not None:
this["handle"] = self.__save_model_to_string()
return this
def __setstate__(self, state):
model_str = state.get('handle', None)
if model_str is not None:
handle = ctypes.c_void_p()
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterLoadModelFromString(
c_str(model_str),
ctypes.byref(out_num_iterations),
ctypes.byref(handle)))
state['handle'] = handle
self.__dict__.update(state)
def free_dataset(self):
self.__dict__.pop('train_set', None)
self.__dict__.pop('valid_sets', None)
def set_train_data_name(self, name):
self.__train_data_name = name
[docs] def add_valid(self, data, name):
"""
Add an validation data
Parameters
----------
data : Dataset
Validation data
name : String
Name of validation data
"""
if not isinstance(data, Dataset):
raise TypeError('valid data should be Dataset instance, met {}'.format(type(data).__name__))
if data._predictor is not self.__init_predictor:
raise LightGBMError("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData(
self.handle,
data.construct().handle))
self.valid_sets.append(data)
self.name_valid_sets.append(name)
self.__num_dataset += 1
self.__inner_predict_buffer.append(None)
self.__is_predicted_cur_iter.append(False)
[docs] def reset_parameter(self, params):
"""
Reset parameters for booster
Parameters
----------
params : dict
New parameters for boosters
silent : boolean, optional
Whether print messages during construction
"""
if 'metric' in params:
self.__need_reload_eval_info = True
params_str = param_dict_to_str(params)
if params_str:
_safe_call(_LIB.LGBM_BoosterResetParameter(
self.handle,
c_str(params_str)))
[docs] def update(self, train_set=None, fobj=None):
"""
Update for one iteration
Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
train_set :
Training data, None means use last training data
fobj : function
Customized objective function.
Returns
-------
is_finished, bool
"""
"""need reset training data"""
if train_set is not None and train_set is not self.train_set:
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
if train_set._predictor is not self.__init_predictor:
raise LightGBMError("Replace training data failed, you should use same predictor for these data")
self.train_set = train_set
_safe_call(_LIB.LGBM_BoosterResetTrainingData(
self.handle,
self.train_set.construct().handle))
self.__inner_predict_buffer[0] = None
is_finished = ctypes.c_int(0)
if fobj is None:
_safe_call(_LIB.LGBM_BoosterUpdateOneIter(
self.handle,
ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
return is_finished.value == 1
else:
grad, hess = fobj(self.__inner_predict(0), self.train_set)
return self.__boost(grad, hess)
def __boost(self, grad, hess):
"""
Boost the booster for one iteration, with customized gradient statistics.
Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
grad : 1d numpy or 1d list
The first order of gradient.
hess : 1d numpy or 1d list
The second order of gradient.
Returns
-------
is_finished, bool
"""
grad = list_to_1d_numpy(grad, name='gradient')
hess = list_to_1d_numpy(hess, name='hessian')
if len(grad) != len(hess):
raise ValueError("Lengths of gradient({}) and hessian({}) don't match".format(len(grad), len(hess)))
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle,
grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
return is_finished.value == 1
[docs] def rollback_one_iter(self):
"""
Rollback one iteration
"""
_safe_call(_LIB.LGBM_BoosterRollbackOneIter(
self.handle))
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
def current_iteration(self):
out_cur_iter = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle,
ctypes.byref(out_cur_iter)))
return out_cur_iter.value
[docs] def eval(self, data, name, feval=None):
"""
Evaluate for data
Parameters
----------
data : Dataset object
name :
Name of data
feval : function
Custom evaluation function.
Returns
-------
result: list
Evaluation result list.
"""
if not isinstance(data, Dataset):
raise TypeError("Can only eval for Dataset instance")
data_idx = -1
if data is self.train_set:
data_idx = 0
else:
for i in range_(len(self.valid_sets)):
if data is self.valid_sets[i]:
data_idx = i + 1
break
"""need to push new valid data"""
if data_idx == -1:
self.add_valid(data, name)
data_idx = self.__num_dataset - 1
return self.__inner_eval(name, data_idx, feval)
[docs] def eval_train(self, feval=None):
"""
Evaluate for training data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result list.
"""
return self.__inner_eval(self.__train_data_name, 0, feval)
[docs] def eval_valid(self, feval=None):
"""
Evaluate for validation data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result list.
"""
return [item for i in range_(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
[docs] def save_model(self, filename, num_iteration=-1):
"""
Save model of booster to file
Parameters
----------
filename : str
Filename to save
num_iteration: int
Number of iteration that want to save. < 0 means save the best iteration(if have)
"""
if num_iteration <= 0:
num_iteration = self.best_iteration
_safe_call(_LIB.LGBM_BoosterSaveModel(
self.handle,
ctypes.c_int(num_iteration),
c_str(filename)))
_save_pandas_categorical(filename, self.pandas_categorical)
def __load_model_from_string(self, model_str):
"""[Private] Load model from string"""
out_num_iterations = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterLoadModelFromString(
c_str(model_str),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
def __save_model_to_string(self, num_iteration=-1):
"""[Private] Save model to string"""
if num_iteration <= 0:
num_iteration = self.best_iteration
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterSaveModelToString(
self.handle,
ctypes.c_int(num_iteration),
ctypes.c_int(buffer_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
actual_len = tmp_out_len.value
'''if buffer length is not long enough, re-allocate a buffer'''
if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterSaveModelToString(
self.handle,
ctypes.c_int(num_iteration),
ctypes.c_int(actual_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
return string_buffer.value.decode()
[docs] def dump_model(self, num_iteration=-1):
"""
Dump model to json format
Parameters
----------
num_iteration: int
Number of iteration that want to dump. < 0 means dump to best iteration(if have)
Returns
-------
Json format of model
"""
if num_iteration <= 0:
num_iteration = self.best_iteration
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
ctypes.c_int(num_iteration),
ctypes.c_int(buffer_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
actual_len = tmp_out_len.value
'''if buffer length is not long enough, reallocate a buffer'''
if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
ctypes.c_int(num_iteration),
ctypes.c_int(actual_len),
ctypes.byref(tmp_out_len),
ptr_string_buffer))
return json.loads(string_buffer.value.decode())
[docs] def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When data type is string, it represents the path of txt file
num_iteration : int
Used iteration for prediction, < 0 means predict for best iteration(if have)
raw_score : bool
True for predict raw score
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data
is_reshape : bool
Reshape to (nrow, ncol) if true
Returns
-------
Prediction result
"""
predictor = self._to_predictor()
if num_iteration <= 0:
num_iteration = self.best_iteration
return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
def _to_predictor(self):
"""Convert to predictor"""
predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor
[docs] def feature_name(self):
"""
Get feature names.
Returns
-------
result : array
Array of feature names.
"""
out_num_feature = ctypes.c_int(0)
"""Get num of features"""
_safe_call(_LIB.LGBM_BoosterGetNumFeature(
self.handle,
ctypes.byref(out_num_feature)))
num_feature = out_num_feature.value
"""Get name of features"""
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetFeatureNames(
self.handle,
ctypes.byref(tmp_out_len),
ptr_string_buffers))
if num_feature != tmp_out_len.value:
raise ValueError("Length of feature names doesn't equal with num_feature")
return [string_buffers[i].value.decode() for i in range_(num_feature)]
[docs] def feature_importance(self, importance_type='split'):
"""
Get feature importances
Parameters
----------
importance_type : str, default "split"
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
Returns
-------
result : array
Array of feature importances.
"""
if importance_type not in ["split", "gain"]:
raise KeyError("importance_type must be split or gain")
dump_model = self.dump_model()
ret = [0] * (dump_model["max_feature_idx"] + 1)
def dfs(root):
if "split_feature" in root:
if root['split_gain'] > 0:
if importance_type == 'split':
ret[root["split_feature"]] += 1
elif importance_type == 'gain':
ret[root["split_feature"]] += root["split_gain"]
dfs(root["left_child"])
dfs(root["right_child"])
for tree in dump_model["tree_info"]:
dfs(tree["tree_structure"])
return np.array(ret)
def __inner_eval(self, data_name, data_idx, feval=None):
"""
Evaulate training or validation data
"""
if data_idx >= self.__num_dataset:
raise ValueError("Data_idx should be smaller than number of dataset")
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range_(self.__num_inner_eval)], dtype=np.float64)
tmp_out_len = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
ctypes.c_int(data_idx),
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("Wrong length of eval results")
for i in range_(self.__num_inner_eval):
ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
if feval is not None:
if data_idx == 0:
cur_data = self.train_set
else:
cur_data = self.valid_sets[data_idx - 1]
feval_ret = feval(self.__inner_predict(data_idx), cur_data)
if isinstance(feval_ret, list):
for eval_name, val, is_higher_better in feval_ret:
ret.append((data_name, eval_name, val, is_higher_better))
else:
eval_name, val, is_higher_better = feval_ret
ret.append((data_name, eval_name, val, is_higher_better))
return ret
def __inner_predict(self, data_idx):
"""
Predict for training and validation dataset
"""
if data_idx >= self.__num_dataset:
raise ValueError("Data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None:
if data_idx == 0:
n_preds = self.train_set.num_data() * self.__num_class
else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range_(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration"""
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
_safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle,
ctypes.c_int(data_idx),
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("Wrong length of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
def __get_eval_info(self):
"""
Get inner evaluation count and names
"""
if self.__need_reload_eval_info:
self.__need_reload_eval_info = False
out_num_eval = ctypes.c_int(0)
"""Get num of inner evals"""
_safe_call(_LIB.LGBM_BoosterGetEvalCounts(
self.handle,
ctypes.byref(out_num_eval)))
self.__num_inner_eval = out_num_eval.value
if self.__num_inner_eval > 0:
"""Get name of evals"""
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle,
ctypes.byref(tmp_out_len),
ptr_string_buffers))
if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("Length of eval names doesn't equal with num_evals")
self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range_(self.__num_inner_eval)]
self.__higher_better_inner_eval = \
[name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
[docs] def attr(self, key):
"""
Get attribute string from the Booster.
Parameters
----------
key : str
The key to get attribute from.
Returns
-------
value : str
The attribute value of the key, returns None if attribute do not exist.
"""
return self.__attr.get(key, None)
[docs] def set_attr(self, **kwargs):
"""
Set the attribute of the Booster.
Parameters
----------
**kwargs
The attributes to set. Setting a value to None deletes an attribute.
"""
for key, value in kwargs.items():
if value is not None:
if not isinstance(value, string_type):
raise ValueError("Set attr only accepts strings")
self.__attr[key] = value
else:
self.__attr.pop(key, None)