[docs] unify language and make small improvements in some param descriptions (#6618)

This commit is contained in:
Nikita Titov 2024-08-27 05:52:12 +03:00 committed by GitHub
Parent 5fa615bc48
Commit a9df7f113f
No known key found for this signature
GPG key ID: B5690EEEBB952194
2 changed files with 84 additions and 63 deletions

View file

@@ -127,10 +127,10 @@ Core Parameters
- ``custom``
- **Note**: Not supported in CLI version
- must be passed through parameters explicitly in the C API
- **Note**: cannot be used in CLI version
- ``boosting`` :raw-html:`<a id="boosting" title="Permalink to this parameter" href="#boosting">&#x1F517;&#xFE0E;</a>`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, aliases: ``boosting_type``, ``boost``
- ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt``
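A hedged Python-package sketch of the ``custom`` objective path described above, assuming the scikit-learn interface, where a callable returning gradients and hessians can be passed as ``objective`` (``l2_objective``, ``X``, and ``y`` are illustrative names, not part of this commit):

    import lightgbm as lgb
    import numpy as np

    def l2_objective(y_true, y_pred):
        # gradient and hessian of the squared error, element-wise
        return y_pred - y_true, np.ones_like(y_true)

    X = np.random.rand(200, 5)
    y = np.random.rand(200)
    # the callable replaces a built-in objective name; there is no CLI equivalent
    model = lgb.LGBMRegressor(objective=l2_objective, n_estimators=10).fit(X, y)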
@@ -225,7 +225,7 @@ Core Parameters
- **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
- **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
- **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU or CUDA support
- ``seed`` :raw-html:`<a id="seed" title="Permalink to this parameter" href="#seed">&#x1F517;&#xFE0E;</a>`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state``
@@ -358,7 +358,7 @@ Learning Control Parameters
- frequency for bagging
- ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
- ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100%`` of the data to use for the next ``k`` iterations
- **Note**: bagging is only effective when ``0.0 < bagging_fraction < 1.0``
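To make the ``bagging_freq`` wording above concrete, a minimal Python sketch (only the parameter dictionary is shown; the surrounding training call is assumed):

    params = {
        "objective": "regression",
        "bagging_freq": 5,        # re-draw the bagged subset every 5 iterations
        "bagging_fraction": 0.8,  # each subset keeps 80% of the rows
    }
    # bagging is active only because 0.0 < bagging_fraction < 1.0 and bagging_freq > 0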
@@ -470,7 +470,7 @@ Learning Control Parameters
- used only in ``dart``
- set this to ``true``, if you want to use xgboost dart mode
- set this to ``true``, if you want to use XGBoost DART mode
- ``uniform_drop`` :raw-html:`<a id="uniform_drop" title="Permalink to this parameter" href="#uniform_drop">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
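A hedged sketch of enabling XGBoost DART mode (parameter names are from this document; the values are illustrative):

    params = {
        "boosting": "dart",
        "drop_rate": 0.1,           # used only in dart
        "xgboost_dart_mode": True,  # switch to XGBoost's DART weighting scheme
    }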
@@ -498,6 +498,8 @@ Learning Control Parameters
- ``min_data_per_group`` :raw-html:`<a id="min_data_per_group" title="Permalink to this parameter" href="#min_data_per_group">&#x1F517;&#xFE0E;</a>`, default = ``100``, type = int, constraints: ``min_data_per_group > 0``
- used for the categorical features
- minimal number of data per categorical group
- ``max_cat_threshold`` :raw-html:`<a id="max_cat_threshold" title="Permalink to this parameter" href="#max_cat_threshold">&#x1F517;&#xFE0E;</a>`, default = ``32``, type = int, constraints: ``max_cat_threshold > 0``
@@ -522,6 +524,8 @@ Learning Control Parameters
- ``max_cat_to_onehot`` :raw-html:`<a id="max_cat_to_onehot" title="Permalink to this parameter" href="#max_cat_to_onehot">&#x1F517;&#xFE0E;</a>`, default = ``4``, type = int, constraints: ``max_cat_to_onehot > 0``
- used for the categorical features
- when number of categories of one feature smaller than or equal to ``max_cat_to_onehot``, one-vs-other split algorithm will be used
- ``top_k`` :raw-html:`<a id="top_k" title="Permalink to this parameter" href="#top_k">&#x1F517;&#xFE0E;</a>`, default = ``20``, type = int, aliases: ``topk``, constraints: ``top_k > 0``
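A hedged Python sketch combining the categorical-feature parameters above (``X``, ``y``, and the column name "city" are placeholders):

    import lightgbm as lgb

    train_set = lgb.Dataset(X, label=y, categorical_feature=["city"])
    params = {
        "min_data_per_group": 100,  # minimal number of data per categorical group
        "max_cat_threshold": 32,    # cap on split points considered per categorical feature
        "max_cat_to_onehot": 4,     # low-cardinality features use one-vs-other splits
    }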
@@ -536,7 +540,7 @@ Learning Control Parameters
- ``1`` means increasing, ``-1`` means decreasing, ``0`` means non-constraint
- you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
- you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for the 1st feature, non-constraint for the 2nd feature and increasing for the 3rd feature
- ``monotone_constraints_method`` :raw-html:`<a id="monotone_constraints_method" title="Permalink to this parameter" href="#monotone_constraints_method">&#x1F517;&#xFE0E;</a>`, default = ``basic``, type = enum, options: ``basic``, ``intermediate``, ``advanced``, aliases: ``monotone_constraining_method``, ``mc_method``
@@ -544,11 +548,11 @@ Learning Control Parameters
- monotone constraints method
- ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions
- ``basic``, the most basic monotone constraints method. It does not slow down the training speed at all, but over-constrains the predictions
- ``intermediate``, a `more advanced method <https://hal.science/hal-02862802/document>`__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
- ``intermediate``, a `more advanced method <https://hal.science/hal-02862802/document>`__, which may slow down the training speed very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
- ``advanced``, an `even more advanced method <https://hal.science/hal-02862802/document>`__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results
- ``advanced``, an `even more advanced method <https://hal.science/hal-02862802/document>`__, which may slow down the training speed. However, this method is even less constraining than the intermediate method and should again significantly improve the results
- ``monotone_penalty`` :raw-html:`<a id="monotone_penalty" title="Permalink to this parameter" href="#monotone_penalty">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, aliases: ``monotone_splits_penalty``, ``ms_penalty``, ``mc_penalty``, constraints: ``monotone_penalty >= 0.0``
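As a concrete instance of the ordering rule above, the Python package accepts the constraint vector as a list with one entry per feature:

    params = {
        "monotone_constraints": [-1, 0, 1],        # 1st decreasing, 2nd unconstrained, 3rd increasing
        "monotone_constraints_method": "advanced", # the least constraining of the three methods
        "monotone_penalty": 0.0,                   # no penalty on monotone splits
    }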
@@ -608,7 +612,7 @@ Learning Control Parameters
- helps prevent overfitting on leaves with few samples
- if set to zero, no smoothing is applied
- if ``0.0`` (the default), no smoothing is applied
- if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
@@ -628,7 +632,7 @@ Learning Control Parameters
- for Python-package, list of lists, e.g. ``[[0, 1, 2], [2, 3]]``
- for R-package, list of character or numeric vectors, e.g. ``list(c("var1", "var2", "var3"), c("var3", "var4"))`` or ``list(c(1L, 2L, 3L), c(3L, 4L))``. Numeric vectors should use 1-based indexing, where ``1L`` is the first feature, ``2L`` is the second feature, etc
- for R-package, list of character or numeric vectors, e.g. ``list(c("var1", "var2", "var3"), c("var3", "var4"))`` or ``list(c(1L, 2L, 3L), c(3L, 4L))``. Numeric vectors should use 1-based indexing, where ``1L`` is the first feature, ``2L`` is the second feature, etc.
- any two features can only appear in the same branch only if there exists a constraint containing both features
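A minimal Python-package sketch of the list-of-lists form described above:

    params = {
        # features 0, 1 and 2 may appear together in a branch; so may features 2 and 3;
        # no branch may combine, e.g., feature 0 with feature 3
        "interaction_constraints": [[0, 1, 2], [2, 3]],
    }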
@@ -680,35 +684,41 @@ Learning Control Parameters
- gradient quantization can accelerate training, with little accuracy drop in most cases
- **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
- **Note**: works only with ``cpu`` and ``cuda`` device type
- *New in version 4.0.0*
- ``num_grad_quant_bins`` :raw-html:`<a id="num_grad_quant_bins" title="Permalink to this parameter" href="#num_grad_quant_bins">&#x1F517;&#xFE0E;</a>`, default = ``4``, type = int
- used only if ``use_quantized_grad=true``
- number of bins to quantization gradients and hessians
- with more bins, the quantized training will be closer to full precision training
- **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
- **Note**: works only with ``cpu`` and ``cuda`` device type
- *New in version 4.0.0*
- ``quant_train_renew_leaf`` :raw-html:`<a id="quant_train_renew_leaf" title="Permalink to this parameter" href="#quant_train_renew_leaf">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- used only if ``use_quantized_grad=true``
- whether to renew the leaf values with original gradients when quantized training
- renewing is very helpful for good quantized training accuracy for ranking objectives
- **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
- **Note**: works only with ``cpu`` and ``cuda`` device type
- *New in version 4.0.0*
- ``stochastic_rounding`` :raw-html:`<a id="stochastic_rounding" title="Permalink to this parameter" href="#stochastic_rounding">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool
- used only if ``use_quantized_grad=true``
- whether to use stochastic rounding in gradient quantization
- **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
- **Note**: works only with ``cpu`` and ``cuda`` device type
- *New in version 4.0.0*
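The quantized-training parameters above are typically toggled together; a hedged sketch, assuming LightGBM >= 4.0 and a ``cpu`` or ``cuda`` device:

    params = {
        "device_type": "cpu",            # quantized training works only on cpu and cuda
        "use_quantized_grad": True,      # discretize gradients and hessians
        "num_grad_quant_bins": 4,        # more bins -> closer to full-precision training
        "quant_train_renew_leaf": True,  # recompute leaf values with original gradients
        "stochastic_rounding": True,     # stochastic rounding during quantization
    }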
@@ -722,25 +732,25 @@ Dataset Parameters
- fit piecewise linear gradient boosting tree
- tree splits are chosen in the usual way, but the model at each leaf is linear instead of constant
- tree splits are chosen in the usual way, but the model at each leaf is linear instead of constant
- the linear model at each leaf includes all the numerical features in that leaf's branch
- the linear model at each leaf includes all the numerical features in that leaf's branch
- the first tree has constant leaf values
- the first tree has constant leaf values
- categorical features are used for splits as normal but are not used in the linear models
- categorical features are used for splits as normal but are not used in the linear models
- missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R
- missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R
- it is recommended to rescale data before training so that features have similar mean and standard deviation
- it is recommended to rescale data before training so that features have similar mean and standard deviation
- **Note**: only works with CPU and ``serial`` tree learner
- **Note**: works only with ``cpu`` device type and ``serial`` tree learner
- **Note**: ``regression_l1`` objective is not supported with linear tree boosting
- **Note**: ``regression_l1`` objective is not supported with linear tree boosting
- **Note**: setting ``linear_tree=true`` significantly increases the memory use of LightGBM
- **Note**: setting ``linear_tree=true`` significantly increases the memory use of LightGBM
- **Note**: if you specify ``monotone_constraints``, constraints will be enforced when choosing the split points, but not when fitting the linear models on leaves
- **Note**: if you specify ``monotone_constraints``, constraints will be enforced when choosing the split points, but not when fitting the linear models on leaves
- ``max_bin`` :raw-html:`<a id="max_bin" title="Permalink to this parameter" href="#max_bin">&#x1F517;&#xFE0E;</a>`, default = ``255``, type = int, aliases: ``max_bins``, constraints: ``max_bin > 1``
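A hedged end-to-end sketch of linear trees on synthetic data, illustrating the ``np.nan`` encoding and the rescaling advice above (assumes the LightGBM Python package):

    import lightgbm as lgb
    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(500, 4))  # features already have similar mean and scale
    y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=500)
    X[rng.random(X.shape) < 0.05] = np.nan  # encode missing values as np.nan, never 0

    params = {
        "objective": "regression",  # regression_l1 is not supported with linear trees
        "linear_tree": True,
        "device_type": "cpu",       # linear trees require cpu + serial tree learner
    }
    booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)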
@@ -1005,13 +1015,13 @@ Predict Parameters
- ``pred_early_stop_freq`` :raw-html:`<a id="pred_early_stop_freq" title="Permalink to this parameter" href="#pred_early_stop_freq">&#x1F517;&#xFE0E;</a>`, default = ``10``, type = int
- used only in ``prediction`` task
- used only in ``prediction`` task and if ``pred_early_stop=true``
- the frequency of checking early-stopping prediction
- ``pred_early_stop_margin`` :raw-html:`<a id="pred_early_stop_margin" title="Permalink to this parameter" href="#pred_early_stop_margin">&#x1F517;&#xFE0E;</a>`, default = ``10.0``, type = double
- used only in ``prediction`` task
- used only in ``prediction`` task and if ``pred_early_stop=true``
- the threshold of margin in early-stopping prediction
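A hedged sketch of prediction-time early stopping (``booster`` and ``X_test`` are assumed to exist; extra keyword arguments to ``predict`` are forwarded as prediction parameters):

    preds = booster.predict(
        X_test,
        pred_early_stop=True,         # enables the two parameters below
        pred_early_stop_freq=10,      # check the margin every 10 iterations
        pred_early_stop_margin=10.0,  # stop once the margin clears this threshold
    )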
@@ -1151,7 +1161,9 @@ Objective Parameters
- ``lambdarank_position_bias_regularization`` :raw-html:`<a id="lambdarank_position_bias_regularization" title="Permalink to this parameter" href="#lambdarank_position_bias_regularization">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``lambdarank_position_bias_regularization >= 0.0``
- used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.
- used only in ``lambdarank`` application when positional information is provided and position bias is modeled
- larger values reduce the inferred position bias factors
- *New in version 4.1.0*
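A hedged sketch of supplying the positional information this parameter depends on, assuming LightGBM >= 4.1, where the Python ``Dataset`` accepts per-row positions (``X``, ``relevance``, ``group_sizes``, and ``positions`` are placeholders):

    import lightgbm as lgb

    train_set = lgb.Dataset(X, label=relevance, group=group_sizes, position=positions)
    params = {
        "objective": "lambdarank",
        "lambdarank_position_bias_regularization": 0.5,  # shrinks the inferred bias factors
    }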
@@ -1263,7 +1275,7 @@ Network Parameters
- the number of machines for distributed learning application
- this parameter is needed to be set in both **socket** and **mpi** versions
- this parameter is needed to be set in both **socket** and **MPI** versions
- ``local_listen_port`` :raw-html:`<a id="local_listen_port" title="Permalink to this parameter" href="#local_listen_port">&#x1F517;&#xFE0E;</a>`, default = ``12400 (random for Dask-package)``, type = int, aliases: ``local_port``, ``port``, constraints: ``local_listen_port > 0``
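A hedged sketch of a distributed-learning (socket version) parameter set; the addresses are hypothetical, and the same values must be used on every machine:

    params = {
        "tree_learner": "data",
        "num_machines": 2,
        "local_listen_port": 12400,
        "machines": "10.0.0.1:12400,10.0.0.2:12400",  # hypothetical ip:port pairs
    }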
@@ -1292,6 +1304,8 @@ GPU Parameters
- ``gpu_platform_id`` :raw-html:`<a id="gpu_platform_id" title="Permalink to this parameter" href="#gpu_platform_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int
- used only with ``gpu`` device type
- OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
- ``-1`` means the system-wide default platform
@@ -1300,7 +1314,7 @@ GPU Parameters
- ``gpu_device_id`` :raw-html:`<a id="gpu_device_id" title="Permalink to this parameter" href="#gpu_device_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int
- OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
- OpenCL device ID in the specified platform or CUDA device ID. Each GPU in the selected platform has a unique device ID
- ``-1`` means the default device in the selected platform
@@ -1310,13 +1324,13 @@ GPU Parameters
- set this to ``true`` to use double precision math on GPU (by default single precision is used)
- **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
- **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
- number of GPUs
- **Note**: can be used only in CUDA implementation
- **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
.. end params list
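Pulling the GPU parameters above together, a hedged Python sketch (platform and device IDs are hypothetical; ``-1`` keeps the defaults):

    # OpenCL build (device_type="gpu")
    params = {
        "device_type": "gpu",
        "gpu_platform_id": 0,  # hypothetical OpenCL platform ID
        "gpu_device_id": 0,    # hypothetical device ID within that platform
        "gpu_use_dp": False,   # honored only by the OpenCL implementation
        "max_bin": 63,         # smaller max_bin is recommended for better GPU speed
    }
    # CUDA build (device_type="cuda"): num_gpu applies, gpu_use_dp does not
    cuda_params = {"device_type": "cuda", "gpu_device_id": 0, "num_gpu": 1}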

View file

@@ -160,8 +160,8 @@ struct Config {
// descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
// desc = custom objective function (gradients and hessians not computed directly by LightGBM)
// descl2 = ``custom``
// descl2 = **Note**: Not supported in CLI version
// descl2 = must be passed through parameters explicitly in the C API
// descl2 = **Note**: cannot be used in CLI version
std::string objective = "regression";
// [no-automatically-extract]
@@ -249,7 +249,7 @@ struct Config {
// desc = ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA
// desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up
// desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
// desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
// desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst>`__ to build LightGBM with GPU or CUDA support
std::string device_type = "cpu";
// [no-automatically-extract]
@@ -350,7 +350,7 @@ struct Config {
// alias = subsample_freq
// desc = frequency for bagging
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100%`` of the data to use for the next ``k`` iterations
// desc = **Note**: bagging is only effective when ``0.0 < bagging_fraction < 1.0``
int bagging_freq = 0;
@@ -447,7 +447,7 @@ struct Config {
double skip_drop = 0.5;
// desc = used only in ``dart``
// desc = set this to ``true``, if you want to use xgboost dart mode
// desc = set this to ``true``, if you want to use XGBoost DART mode
bool xgboost_dart_mode = false;
// desc = used only in ``dart``
@@ -471,6 +471,7 @@ struct Config {
double other_rate = 0.1;
// check = >0
// desc = used for the categorical features
// desc = minimal number of data per categorical group
int min_data_per_group = 100;
@@ -491,6 +492,7 @@ struct Config {
double cat_smooth = 10.0;
// check = >0
// desc = used for the categorical features
// desc = when number of categories of one feature smaller than or equal to ``max_cat_to_onehot``, one-vs-other split algorithm will be used
int max_cat_to_onehot = 4;
@@ -505,7 +507,7 @@ struct Config {
// default = None
// desc = used for constraints of monotonic features
// desc = ``1`` means increasing, ``-1`` means decreasing, ``0`` means non-constraint
// desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature
// desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for the 1st feature, non-constraint for the 2nd feature and increasing for the 3rd feature
std::vector<int8_t> monotone_constraints;
// type = enum
@@ -513,9 +515,9 @@ struct Config {
// options = basic, intermediate, advanced
// desc = used only if ``monotone_constraints`` is set
// desc = monotone constraints method
// descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions
// descl2 = ``intermediate``, a `more advanced method <https://hal.science/hal-02862802/document>`__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
// descl2 = ``advanced``, an `even more advanced method <https://hal.science/hal-02862802/document>`__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results
// descl2 = ``basic``, the most basic monotone constraints method. It does not slow down the training speed at all, but over-constrains the predictions
// descl2 = ``intermediate``, a `more advanced method <https://hal.science/hal-02862802/document>`__, which may slow down the training speed very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
// descl2 = ``advanced``, an `even more advanced method <https://hal.science/hal-02862802/document>`__, which may slow down the training speed. However, this method is even less constraining than the intermediate method and should again significantly improve the results
std::string monotone_constraints_method = "basic";
// alias = monotone_splits_penalty, ms_penalty, mc_penalty
@@ -569,7 +571,7 @@ struct Config {
// check = >= 0.0
// desc = controls smoothing applied to tree nodes
// desc = helps prevent overfitting on leaves with few samples
// desc = if set to zero, no smoothing is applied
// desc = if ``0.0`` (the default), no smoothing is applied
// desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
// desc = larger values give stronger regularization
// descl2 = the weight of each node is ``w * (n / path_smooth) / (n / path_smooth + 1) + w_p / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
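A tiny worked example of the node-weight formula above (plain Python arithmetic; the values are illustrative):

    def smoothed_weight(w, w_p, n, path_smooth):
        # w: optimal node weight, w_p: parent node weight, n: samples in the node
        k = n / path_smooth
        return w * k / (k + 1) + w_p / (k + 1)

    # few samples: the node weight is pulled strongly toward the parent
    smoothed_weight(w=2.0, w_p=0.5, n=5, path_smooth=10.0)     # -> 1.0
    # many samples: smoothing is nearly a no-op
    smoothed_weight(w=2.0, w_p=0.5, n=5000, path_smooth=10.0)  # -> ~1.997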
@@ -580,7 +582,7 @@ struct Config {
// desc = by default interaction constraints are disabled, to enable them you can specify
// descl2 = for CLI, lists separated by commas, e.g. ``[0,1,2],[2,3]``
// descl2 = for Python-package, list of lists, e.g. ``[[0, 1, 2], [2, 3]]``
// descl2 = for R-package, list of character or numeric vectors, e.g. ``list(c("var1", "var2", "var3"), c("var3", "var4"))`` or ``list(c(1L, 2L, 3L), c(3L, 4L))``. Numeric vectors should use 1-based indexing, where ``1L`` is the first feature, ``2L`` is the second feature, etc
// descl2 = for R-package, list of character or numeric vectors, e.g. ``list(c("var1", "var2", "var3"), c("var3", "var4"))`` or ``list(c(1L, 2L, 3L), c(3L, 4L))``. Numeric vectors should use 1-based indexing, where ``1L`` is the first feature, ``2L`` is the second feature, etc.
// desc = any two features can only appear in the same branch only if there exists a constraint containing both features
std::string interaction_constraints = "";
@@ -619,24 +621,27 @@ struct Config {
// desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
// desc = with quantized training, most arithmetics in the training process will be integer operations
// desc = gradient quantization can accelerate training, with little accuracy drop in most cases
// desc = **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
// desc = **Note**: works only with ``cpu`` and ``cuda`` device type
// desc = *New in version 4.0.0*
bool use_quantized_grad = false;
// desc = used only if ``use_quantized_grad=true``
// desc = number of bins to quantization gradients and hessians
// desc = with more bins, the quantized training will be closer to full precision training
// desc = **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
// desc = **Note**: works only with ``cpu`` and ``cuda`` device type
// desc = *New in version 4.0.0*
int num_grad_quant_bins = 4;
// desc = used only if ``use_quantized_grad=true``
// desc = whether to renew the leaf values with original gradients when quantized training
// desc = renewing is very helpful for good quantized training accuracy for ranking objectives
// desc = **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
// desc = **Note**: works only with ``cpu`` and ``cuda`` device type
// desc = *New in version 4.0.0*
bool quant_train_renew_leaf = false;
// desc = used only if ``use_quantized_grad=true``
// desc = whether to use stochastic rounding in gradient quantization
// desc = **Note**: can be used only with ``device_type = cpu`` and ``device_type=cuda``
// desc = **Note**: works only with ``cpu`` and ``cuda`` device type
// desc = *New in version 4.0.0*
bool stochastic_rounding = true;
@@ -650,16 +655,16 @@ struct Config {
// alias = linear_trees
// desc = fit piecewise linear gradient boosting tree
// descl2 = tree splits are chosen in the usual way, but the model at each leaf is linear instead of constant
// descl2 = the linear model at each leaf includes all the numerical features in that leaf's branch
// descl2 = the first tree has constant leaf values
// descl2 = categorical features are used for splits as normal but are not used in the linear models
// descl2 = missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R
// descl2 = it is recommended to rescale data before training so that features have similar mean and standard deviation
// descl2 = **Note**: only works with CPU and ``serial`` tree learner
// descl2 = **Note**: ``regression_l1`` objective is not supported with linear tree boosting
// descl2 = **Note**: setting ``linear_tree=true`` significantly increases the memory use of LightGBM
// descl2 = **Note**: if you specify ``monotone_constraints``, constraints will be enforced when choosing the split points, but not when fitting the linear models on leaves
// desc = tree splits are chosen in the usual way, but the model at each leaf is linear instead of constant
// desc = the linear model at each leaf includes all the numerical features in that leaf's branch
// desc = the first tree has constant leaf values
// desc = categorical features are used for splits as normal but are not used in the linear models
// desc = missing values should not be encoded as ``0``. Use ``np.nan`` for Python, ``NA`` for the CLI, and ``NA``, ``NA_real_``, or ``NA_integer_`` for R
// desc = it is recommended to rescale data before training so that features have similar mean and standard deviation
// desc = **Note**: works only with ``cpu`` device type and ``serial`` tree learner
// desc = **Note**: ``regression_l1`` objective is not supported with linear tree boosting
// desc = **Note**: setting ``linear_tree=true`` significantly increases the memory use of LightGBM
// desc = **Note**: if you specify ``monotone_constraints``, constraints will be enforced when choosing the split points, but not when fitting the linear models on leaves
bool linear_tree = false;
// alias = max_bins
@@ -862,12 +867,12 @@ struct Config {
bool pred_early_stop = false;
// [no-save]
// desc = used only in ``prediction`` task
// desc = used only in ``prediction`` task and if ``pred_early_stop=true``
// desc = the frequency of checking early-stopping prediction
int pred_early_stop_freq = 10;
// [no-save]
// desc = used only in ``prediction`` task
// desc = used only in ``prediction`` task and if ``pred_early_stop=true``
// desc = the threshold of margin in early-stopping prediction
double pred_early_stop_margin = 10.0;
@@ -985,7 +990,8 @@ struct Config {
std::vector<double> label_gain;
// check = >=0.0
// desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled. Larger values reduce the inferred position bias factors.
// desc = used only in ``lambdarank`` application when positional information is provided and position bias is modeled
// desc = larger values reduce the inferred position bias factors
// desc = *New in version 4.1.0*
double lambdarank_position_bias_regularization = 0.0;
@@ -1075,7 +1081,7 @@ struct Config {
// check = >0
// alias = num_machine
// desc = the number of machines for distributed learning application
// desc = this parameter is needed to be set in both **socket** and **mpi** versions
// desc = this parameter is needed to be set in both **socket** and **MPI** versions
int num_machines = 1;
// check = >0
@@ -1105,23 +1111,24 @@ struct Config {
#pragma region GPU Parameters
#endif // __NVCC__
// desc = used only with ``gpu`` device type
// desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
// desc = ``-1`` means the system-wide default platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_platform_id = -1;
// desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
// desc = OpenCL device ID in the specified platform or CUDA device ID. Each GPU in the selected platform has a unique device ID
// desc = ``-1`` means the default device in the selected platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_device_id = -1;
// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
// desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
// desc = **Note**: can be used only in OpenCL implementation (``device_type="gpu"``), in CUDA implementation only double precision is currently supported
bool gpu_use_dp = false;
// check = >0
// desc = number of GPUs
// desc = **Note**: can be used only in CUDA implementation
// desc = **Note**: can be used only in CUDA implementation (``device_type="cuda"``)
int num_gpu = 1;
#ifndef __NVCC__