[python] avoid data copy where possible (#2383)

* avoid copy where possible
* use precise type for importance type
* removed pointless code
* simplify sparse pandas Series conversion
* more memory savings
* always force type conversion for 1-D arrays
* one more copy=False
2019-09-26 23:37:47 +03:00 · 2019-09-26 23:37:47 +03:00 · d064019f22
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@ -80,10 +80,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
    elif isinstance(data, Series):
        if _get_bad_pandas_dtypes([data.dtypes]):
            raise ValueError('Series.dtypes must be int, float or bool')
-        if hasattr(data.values, 'values'):  # SparseArray
-            return data.values.values.astype(dtype)
-        else:
-            return data.values.astype(dtype)
+        return np.array(data, dtype=dtype, copy=False)  # SparseArray should be supported as well
    else:
        raise TypeError("Wrong type({0}) for {1}.\n"
                        "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
@ -296,7 +293,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
            raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                             "Did not expect the data types in the following fields: "
                             + ', '.join(data.columns[bad_indices]))
-        data = data.values.astype('float')
+        data = data.values
+        if data.dtype != np.float32 and data.dtype != np.float64:
+            data = data.astype(np.float32)
    else:
        if feature_name == 'auto':
            feature_name = None
@ -311,7 +310,7 @@ def _label_from_pandas(label):
            raise ValueError('DataFrame for label cannot have multiple columns')
        if _get_bad_pandas_dtypes(label.dtypes):
            raise ValueError('DataFrame.dtypes for label must be int, float or bool')
-        label = label.values.astype('float').flatten()
+        label = np.ravel(label.values.astype(np.float32, copy=False))
    return label


@ -534,8 +533,7 @@ class _InnerPredictor(object):
        def inner_predict(mat, num_iteration, predict_type, preds=None):
            if mat.dtype == np.float32 or mat.dtype == np.float64:
                data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
-            else:
-                """change non-float data to float data, need to copy"""
+            else:  # change non-float data to float data, need to copy
                data = np.array(mat.reshape(mat.size), dtype=np.float32)
            ptr_data, type_ptr_data, _ = c_float_array(data)
            n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
@ -876,8 +874,7 @@ class Dataset(object):
        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
-        else:
-            # change non-float data to float data, need to copy
+        else:  # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)

        ptr_data, type_ptr_data, _ = c_float_array(data)
@ -915,8 +912,7 @@ class Dataset(object):

            if mat.dtype == np.float32 or mat.dtype == np.float64:
                mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
-            else:
-                # change non-float data to float data, need to copy
+            else:  # change non-float data to float data, need to copy
                mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

            chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
@ -1012,7 +1008,7 @@ class Dataset(object):
                    used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
                    assert used_indices.flags.c_contiguous
                    if self.reference.group is not None:
-                        group_info = np.array(self.reference.group).astype(int)
+                        group_info = np.array(self.reference.group).astype(np.int32, copy=False)
                        _, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
                                                  return_counts=True)
                    self.handle = ctypes.c_void_p()
@ -2512,7 +2508,7 @@ class Booster(object):
            ctypes.c_int(importance_type_int),
            result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
        if importance_type_int == 0:
-            return result.astype(int)
+            return result.astype(np.int32)
        else:
            return result

--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@ -308,17 +308,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
        if hasattr(folds, 'split'):
            group_info = full_data.get_group()
            if group_info is not None:
-                group_info = np.array(group_info, dtype=int)
+                group_info = np.array(group_info, dtype=np.int32, copy=False)
                flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
            else:
-                flatted_group = np.zeros(num_data, dtype=int)
+                flatted_group = np.zeros(num_data, dtype=np.int32)
            folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        if 'objective' in params and params['objective'] == 'lambdarank':
            if not SKLEARN_INSTALLED:
                raise LightGBMError('Scikit-learn is required for lambdarank cv.')
            # lambdarank task, split according to groups
-            group_info = np.array(full_data.get_group(), dtype=int)
+            group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
            flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)