[python] avoid data copy where possible (#2383)

* avoid copy where possible

* use precise type for importance type

* removed pointless code

* simplify sparse pandas Series conversion

* more memory savings

* always force type conversion for 1-D arrays

* one more copy=False
This commit is contained in:
Nikita Titov 2019-09-26 23:37:47 +03:00 коммит произвёл GitHub
Родитель 7a8c4e52b6
Коммит d064019f22
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файла: 13 добавлений и 17 удалений

Просмотреть файл

@@ -80,10 +80,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif isinstance(data, Series):
if _get_bad_pandas_dtypes([data.dtypes]):
raise ValueError('Series.dtypes must be int, float or bool')
if hasattr(data.values, 'values'): # SparseArray
return data.values.values.astype(dtype)
else:
return data.values.astype(dtype)
return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well
else:
raise TypeError("Wrong type({0}) for {1}.\n"
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
@@ -296,7 +293,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: "
+ ', '.join(data.columns[bad_indices]))
data = data.values.astype('float')
data = data.values
if data.dtype != np.float32 and data.dtype != np.float64:
data = data.astype(np.float32)
else:
if feature_name == 'auto':
feature_name = None
@@ -311,7 +310,7 @@ def _label_from_pandas(label):
raise ValueError('DataFrame for label cannot have multiple columns')
if _get_bad_pandas_dtypes(label.dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float').flatten()
label = np.ravel(label.values.astype(np.float32, copy=False))
return label
@@ -534,8 +533,7 @@ class _InnerPredictor(object):
def inner_predict(mat, num_iteration, predict_type, preds=None):
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
@@ -876,8 +874,7 @@ class Dataset(object):
self.handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
# change non-float data to float data, need to copy
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = c_float_array(data)
@@ -915,8 +912,7 @@ class Dataset(object):
if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
# change non-float data to float data, need to copy
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
@@ -1012,7 +1008,7 @@ class Dataset(object):
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
assert used_indices.flags.c_contiguous
if self.reference.group is not None:
group_info = np.array(self.reference.group).astype(int)
group_info = np.array(self.reference.group).astype(np.int32, copy=False)
_, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
return_counts=True)
self.handle = ctypes.c_void_p()
@@ -2512,7 +2508,7 @@ class Booster(object):
ctypes.c_int(importance_type_int),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if importance_type_int == 0:
return result.astype(int)
return result.astype(np.int32)
else:
return result

Просмотреть файл

@@ -308,17 +308,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
if hasattr(folds, 'split'):
group_info = full_data.get_group()
if group_info is not None:
group_info = np.array(group_info, dtype=int)
group_info = np.array(group_info, dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=int)
flatted_group = np.zeros(num_data, dtype=np.int32)
folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
else:
if 'objective' in params and params['objective'] == 'lambdarank':
if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for lambdarank cv.')
# lambdarank task, split according to groups
group_info = np.array(full_data.get_group(), dtype=int)
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)