From 497e60ed144007d051bcdfc8bdb36d4f2f5ff681 Mon Sep 17 00:00:00 2001 From: Nikita Titov Date: Sat, 19 May 2018 08:01:00 +0300 Subject: [PATCH] [docs] multiple documents improvements (#1378) * delegated not showing hidden items to the rtd_theme * refactored docs --- .travis/test.sh | 2 +- docs/Development-Guide.rst | 54 ++++---- docs/Experiments.rst | 218 +++++++++++++++---------------- docs/GPU-Performance.rst | 10 +- docs/GPU-Targets.rst | 2 - docs/GPU-Windows.rst | 1 - docs/Parallel-Learning-Guide.rst | 34 ++--- docs/Parameters-Tuning.rst | 2 +- docs/Parameters.rst | 45 +++---- docs/Quick-Start.rst | 111 ++++++++-------- docs/README.rst | 4 +- docs/_static/js/rst_links_fix.js | 1 - docs/conf.py | 4 +- 13 files changed, 236 insertions(+), 252 deletions(-) diff --git a/.travis/test.sh b/.travis/test.sh index 938d202f4..d2715035c 100644 --- a/.travis/test.sh +++ b/.travis/test.sh @@ -33,7 +33,7 @@ if [[ ${TASK} == "check-docs" ]]; then if [[ ${PYTHON_VERSION} == "2.7" ]]; then conda install mock fi - conda install sphinx sphinx_rtd_theme # html5validator + conda install sphinx "sphinx_rtd_theme>=0.3" # html5validator pip install rstcheck cd python-package rstcheck --report warning `find . -type f -name "*.rst"` || exit -1 diff --git a/docs/Development-Guide.rst b/docs/Development-Guide.rst index 50b35b47b..f6255807d 100644 --- a/docs/Development-Guide.rst +++ b/docs/Development-Guide.rst @@ -12,33 +12,33 @@ Classes and Code Structure Important Classes ~~~~~~~~~~~~~~~~~ -+-------------------------+--------------------------------------------------------------------------------------+ -| Class | Description | -+=========================+======================================================================================+ -| ``Application`` | The entrance of application, including training and prediction logic | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Bin`` | Data structure used for store feature discrete values (converted from float values) | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Boosting`` | Boosting interface, current implementation is GBDT and DART | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Config`` | Store parameters and configurations | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Dataset`` | Store information of dataset | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``DatasetLoader`` | Used to construct dataset | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Feature`` | Store One column feature | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Metric`` | Evaluation metrics | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``Network`` | Network interfaces and communication algorithms | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``ObjectiveFunction`` | Objective function used to train | -+-------------------------+--------------------------------------------------------------------------------------+ -| 
``Tree`` | Store information of tree model | -+-------------------------+--------------------------------------------------------------------------------------+ -| ``TreeLearner`` | Used to learn trees | -+-------------------------+--------------------------------------------------------------------------------------+ ++-------------------------+----------------------------------------------------------------------------------------+ +| Class | Description | ++=========================+========================================================================================+ +| ``Application`` | The entrance of application, including training and prediction logic | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Bin`` | Data structure used for storing feature discrete values (converted from float values) | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Boosting`` | Boosting interface (GBDT, DART, GOSS, etc.) | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Config`` | Stores parameters and configurations | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Dataset`` | Stores information of dataset | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``DatasetLoader`` | Used to construct dataset | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Feature`` | Stores one column feature | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Metric`` | Evaluation metrics | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Network`` | Network interfaces and communication algorithms | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``ObjectiveFunction`` | Objective functions used to train | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``Tree`` | Stores information of tree model | ++-------------------------+----------------------------------------------------------------------------------------+ +| ``TreeLearner`` | Used to learn trees | ++-------------------------+----------------------------------------------------------------------------------------+ Code Structure ~~~~~~~~~~~~~~ diff --git a/docs/Experiments.rst b/docs/Experiments.rst index 207954b08..d11851b58 100644 --- a/docs/Experiments.rst +++ b/docs/Experiments.rst @@ -9,39 +9,39 @@ For the detailed experiment scripts and output logs, please refer to this `repo` Data ^^^^ -We use 5 datasets to conduct our comparison experiments. Details of data are listed in the following table: +We used 5 datasets to conduct our comparison experiments. 
Details of data are listed in the following table: -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ -| **Data** | **Task** | **Link** | **#Train\_Set** | **#Feature** | **Comments** | -+=============+=========================+========================================================================+===================+================+=============================================+ -| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | use last 500,000 samples as test set | -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ -| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ -| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ -| Expo | Binary classification | `link `__ | 11,000,000 | 700 | use last 1,000,000 as test set | -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ -| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | use last 1,000,000 as test set | -+-------------+-------------------------+------------------------------------------------------------------------+-------------------+----------------+---------------------------------------------+ ++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Data | Task | Link | #Train\_Set | #Feature | Comments | ++===========+=======================+========================================================================+=============+==========+==============================================+ +| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | last 500,000 samples were used as test set | ++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | ++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | ++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Expo | Binary classification | `link `__ | 11,000,000 | 700 | last 1,000,000 samples were used as test set | 
++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | last 1,000,000 samples were used as test set | ++-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ Environment ^^^^^^^^^^^ -We use one Linux server as experiment platform, details are listed in the following table: +We used one Linux server as experiment platform, details are listed in the following table: -+--------------------+-------------------+-----------------------+ -| **OS** | **CPU** | **Memory** | -+====================+===================+=======================+ -| Ubuntu 14.04 LTS | 2 \* E5-2670 v3 | DDR4 2133Mhz, 256GB | -+--------------------+-------------------+-----------------------+ ++------------------+-----------------+---------------------+ +| OS | CPU | Memory | ++==================+=================+=====================+ +| Ubuntu 14.04 LTS | 2 \* E5-2670 v3 | DDR4 2133Mhz, 256GB | ++------------------+-----------------+---------------------+ Baseline ^^^^^^^^ -We use `xgboost`_ as a baseline. +We used `xgboost`_ as a baseline. -Both xgboost and LightGBM are built with OpenMP support. +Both xgboost and LightGBM were built with OpenMP support. Settings ^^^^^^^^ @@ -96,75 +96,76 @@ Result Speed ''''' -For speed comparison, we only run the training task, which is without any test or metric output. And we don't count the time for IO. +For speed comparison, we only run the training task, which was without any test or metric output. And we didn't count the time for IO. The following table is the comparison of time cost: -+-------------+---------------+---------------------+------------------+ -| **Data** | **xgboost** | **xgboost\_hist** | **LightGBM** | -+=============+===============+=====================+==================+ -| Higgs | 3794.34 s | 551.898 s | **238.505513 s** | -+-------------+---------------+---------------------+------------------+ -| Yahoo LTR | 674.322 s | 265.302 s | **150.18644 s** | -+-------------+---------------+---------------------+------------------+ -| MS LTR | 1251.27 s | 385.201 s | **215.320316 s** | -+-------------+---------------+---------------------+------------------+ -| Expo | 1607.35 s | 588.253 s | **138.504179 s** | -+-------------+---------------+---------------------+------------------+ -| Allstate | 2867.22 s | 1355.71 s | **348.084475 s** | -+-------------+---------------+---------------------+------------------+ ++-----------+-----------+---------------+------------------+ +| Data | xgboost | xgboost\_hist | LightGBM | ++===========+===========+===============+==================+ +| Higgs | 3794.34 s | 551.898 s | **238.505513 s** | ++-----------+-----------+---------------+------------------+ +| Yahoo LTR | 674.322 s | 265.302 s | **150.18644 s** | ++-----------+-----------+---------------+------------------+ +| MS LTR | 1251.27 s | 385.201 s | **215.320316 s** | ++-----------+-----------+---------------+------------------+ +| Expo | 1607.35 s | 588.253 s | **138.504179 s** | ++-----------+-----------+---------------+------------------+ +| Allstate | 2867.22 s | 1355.71 s | **348.084475 s** | ++-----------+-----------+---------------+------------------+ We found LightGBM is faster than xgboost on all experiment data sets. 
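
For reference, only the training phase was timed. A run of this kind can be reproduced with the CLI version roughly as follows; this is an illustrative sketch only, the file name is a placeholder, and the exact configurations used are in the experiments repository linked above.

::

    # train only: no validation data and no metric output are configured,
    # so the measured time covers boosting rather than evaluation
    time ./lightgbm task=train data=higgs.train objective=binary num_leaves=255 num_trees=500 num_threads=16
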
Accuracy '''''''' -For accuracy comparison, we use the accuracy on test data set to have a fair comparison. +For accuracy comparison, we used the accuracy on test data set to have a fair comparison. -+-------------+-----------------+---------------+---------------------+----------------+ -| **Data** | **Metric** | **xgboost** | **xgboost\_hist** | **LightGBM** | -+=============+=================+===============+=====================+================+ -| Higgs | AUC | 0.839593 | 0.845605 | 0.845154 | -+-------------+-----------------+---------------+---------------------+----------------+ -| Yahoo LTR | NDCG\ :sub:`1` | 0.719748 | 0.720223 | 0.732466 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`3` | 0.717813 | 0.721519 | 0.738048 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`5` | 0.737849 | 0.739904 | 0.756548 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`10` | 0.78089 | 0.783013 | 0.796818 | -+-------------+-----------------+---------------+---------------------+----------------+ -| MS LTR | NDCG\ :sub:`1` | 0.483956 | 0.488649 | 0.524255 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`3` | 0.467951 | 0.473184 | 0.505327 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`5` | 0.472476 | 0.477438 | 0.510007 | -| +-----------------+---------------+---------------------+----------------+ -| | NDCG\ :sub:`10` | 0.492429 | 0.496967 | 0.527371 | -+-------------+-----------------+---------------+---------------------+----------------+ -| Expo | AUC | 0.756713 | 0.777777 | 0.777543 | -+-------------+-----------------+---------------+---------------------+----------------+ -| Allstate | AUC | 0.607201 | 0.609042 | 0.609167 | -+-------------+-----------------+---------------+---------------------+----------------+ ++-----------+-----------------+----------+---------------+----------+ +| Data | Metric | xgboost | xgboost\_hist | LightGBM | ++===========+=================+==========+===============+==========+ +| Higgs | AUC | 0.839593 | 0.845605 | 0.845154 | ++-----------+-----------------+----------+---------------+----------+ +| Yahoo LTR | NDCG\ :sub:`1` | 0.719748 | 0.720223 | 0.732466 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`3` | 0.717813 | 0.721519 | 0.738048 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`5` | 0.737849 | 0.739904 | 0.756548 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`10` | 0.78089 | 0.783013 | 0.796818 | ++-----------+-----------------+----------+---------------+----------+ +| MS LTR | NDCG\ :sub:`1` | 0.483956 | 0.488649 | 0.524255 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`3` | 0.467951 | 0.473184 | 0.505327 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`5` | 0.472476 | 0.477438 | 0.510007 | +| +-----------------+----------+---------------+----------+ +| | NDCG\ :sub:`10` | 0.492429 | 0.496967 | 0.527371 | ++-----------+-----------------+----------+---------------+----------+ +| Expo | AUC | 0.756713 | 0.777777 | 0.777543 | ++-----------+-----------------+----------+---------------+----------+ +| Allstate | AUC | 0.607201 | 0.609042 | 0.609167 | ++-----------+-----------------+----------+---------------+----------+ Memory Consumption '''''''''''''''''' -We 
monitor RES while running training task. And we set ``two_round=true`` (will increase data-loading time, but reduce peak memory usage, not affect training speed or accuracy) in LightGBM to reduce peak memory usage.
+We monitored RES while running the training task. We set ``two_round=true`` in LightGBM to reduce peak memory usage
+(this increases data-loading time, but does not affect training speed or accuracy).
 
-+-------------+---------------+---------------------+----------------+
-| **Data**    | **xgboost**   | **xgboost\_hist**   | **LightGBM**   |
-+=============+===============+=====================+================+
-| Higgs       | 4.853GB       | 3.784GB             | **0.868GB**    |
-+-------------+---------------+---------------------+----------------+
-| Yahoo LTR   | 1.907GB       | 1.468GB             | **0.831GB**    |
-+-------------+---------------+---------------------+----------------+
-| MS LTR      | 5.469GB       | 3.654GB             | **0.886GB**    |
-+-------------+---------------+---------------------+----------------+
-| Expo        | 1.553GB       | 1.393GB             | **0.543GB**    |
-+-------------+---------------+---------------------+----------------+
-| Allstate    | 6.237GB       | 4.990GB             | **1.027GB**    |
-+-------------+---------------+---------------------+----------------+
++-----------+---------+---------------+-------------+
+| Data      | xgboost | xgboost\_hist | LightGBM    |
++===========+=========+===============+=============+
+| Higgs     | 4.853GB | 3.784GB       | **0.868GB** |
++-----------+---------+---------------+-------------+
+| Yahoo LTR | 1.907GB | 1.468GB       | **0.831GB** |
++-----------+---------+---------------+-------------+
+| MS LTR    | 5.469GB | 3.654GB       | **0.886GB** |
++-----------+---------+---------------+-------------+
+| Expo      | 1.553GB | 1.393GB       | **0.543GB** |
++-----------+---------+---------------+-------------+
+| Allstate  | 6.237GB | 4.990GB       | **1.027GB** |
++-----------+---------+---------------+-------------+
 
 Parallel Experiment
 -------------------
@@ -172,30 +173,29 @@ Parallel Experiment
 Data
 ^^^^
 
-We use a terabyte click log dataset to conduct parallel experiments. Details are listed in following table:
+We used a terabyte click log dataset to conduct parallel experiments. Details are listed in the following table:
 
-+------------+-------------------------+------------+-----------------+----------------+
-| **Data**   | **Task**                | **Link**   | **#Data**       | **#Feature**   |
-+============+=========================+============+=================+================+
-| Criteo     | Binary classification   | `link`_    | 1,700,000,000   | 67             |
-+------------+-------------------------+------------+-----------------+----------------+
++--------+-----------------------+---------+---------------+----------+
+| Data   | Task                  | Link    | #Data         | #Feature |
++========+=======================+=========+===============+==========+
+| Criteo | Binary classification | `link`_ | 1,700,000,000 | 67       |
++--------+-----------------------+---------+---------------+----------+
 
 This data contains 13 integer features and 26 categorical features from 24 days of click logs.
-We statistic the CTR and count for these 26 category features from the first ten days,
-then use next ten days' data, which had been replaced the category features by the corresponding CTR and count, as training data.
+We computed the CTR and count statistics for these 26 categorical features from the first ten days,
+then used the next ten days' data, with the categorical features replaced by the corresponding CTR and count, as training data.
 The processed training data has a total of 1.7 billion records and 67 features.
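
The encoding step described above can be sketched as follows. This is an illustrative outline only, assuming a pandas-style workflow; the column names and file paths are hypothetical, and this is not the script that was actually used.

.. code:: python

    import pandas as pd

    # Illustrative sketch only: column names and paths are hypothetical.
    stats_part = pd.read_csv("criteo_days_01_10.csv")   # first ten days, used only for statistics
    train_part = pd.read_csv("criteo_days_11_20.csv")   # next ten days, becomes the training data

    categorical_cols = ["C%d" % i for i in range(1, 27)]  # the 26 categorical features

    for col in categorical_cols:
        grouped = stats_part.groupby(col)["label"]
        ctr = grouped.mean()    # click-through rate of each category value
        cnt = grouped.size()    # occurrence count of each category value
        train_part[col + "_ctr"] = train_part[col].map(ctr).fillna(0.0)
        train_part[col + "_count"] = train_part[col].map(cnt).fillna(0)

    # keep the 13 integer features plus the new CTR/count columns
    train_part = train_part.drop(columns=categorical_cols)
    train_part.to_csv("criteo_train_processed.csv", index=False)
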
Environment ^^^^^^^^^^^ -We use 16 Windows servers as experiment platform, details are listed in following table: +We used 16 Windows servers as experiment platform, details are listed in following table: -+----------------------+-----------------+----------------------+-------------------------------+ -| **OS** | **CPU** | **Memory** | **Network Adapter** | -+======================+=================+======================+===============================+ -| Windows Server 2012 | 2 * E5-2670 v2 | DDR3 1600Mhz, 256GB | Mellanox ConnectX-3, 54Gbps, | -| | | | RDMA support | -+----------------------+-----------------+----------------------+-------------------------------+ ++---------------------+-----------------+---------------------+-------------------------------------------+ +| OS | CPU | Memory | Network Adapter | ++=====================+=================+=====================+===========================================+ +| Windows Server 2012 | 2 \* E5-2670 v2 | DDR3 1600Mhz, 256GB | Mellanox ConnectX-3, 54Gbps, RDMA support | ++---------------------+-----------------+---------------------+-------------------------------------------+ Settings ^^^^^^^^ @@ -208,28 +208,28 @@ Settings num_thread = 16 tree_learner = data -We use data parallel here, since this data is large in ``#data`` but small in ``#feature``. +We used data parallel here, since this data is large in ``#data`` but small in ``#feature``. -Other parameters are default values. +Other parameters were default values. -Result -^^^^^^ +Results +^^^^^^^ -+----------------+---------------------+---------------------------------+ -| **#Machine** | **Time per Tree** | **Memory Usage(per Machine)** | -+================+=====================+=================================+ -| 1 | 627.8 s | 176GB | -+----------------+---------------------+---------------------------------+ -| 2 | 311 s | 87GB | -+----------------+---------------------+---------------------------------+ -| 4 | 156 s | 43GB | -+----------------+---------------------+---------------------------------+ -| 8 | 80 s | 22GB | -+----------------+---------------------+---------------------------------+ -| 16 | 42 s | 11GB | -+----------------+---------------------+---------------------------------+ ++----------+---------------+---------------------------+ +| #Machine | Time per Tree | Memory Usage(per Machine) | ++==========+===============+===========================+ +| 1 | 627.8 s | 176GB | ++----------+---------------+---------------------------+ +| 2 | 311 s | 87GB | ++----------+---------------+---------------------------+ +| 4 | 156 s | 43GB | ++----------+---------------+---------------------------+ +| 8 | 80 s | 22GB | ++----------+---------------+---------------------------+ +| 16 | 42 s | 11GB | ++----------+---------------+---------------------------+ -From the results, we find that LightGBM performs linear speed up in parallel learning. +From the results, we found that LightGBM performs linear speed up in parallel learning. GPU Experiments --------------- diff --git a/docs/GPU-Performance.rst b/docs/GPU-Performance.rst index 3b2d6bea4..89b8e5b22 100644 --- a/docs/GPU-Performance.rst +++ b/docs/GPU-Performance.rst @@ -15,11 +15,13 @@ We target AMD Graphics Core Next (GCN) architecture and NVIDIA Maxwell and Pasca Most AMD GPUs released after 2012 and NVIDIA GPUs released after 2014 should be supported. 
We have tested the GPU implementation on the following GPUs: - AMD RX 480 with AMDGPU-pro driver 16.60 on Ubuntu 16.10 -- AMD R9 280X (aka Radeon HD 7970) with fglrx driver 15.302.2301 on - Ubuntu 16.10 + +- AMD R9 280X (aka Radeon HD 7970) with fglrx driver 15.302.2301 on Ubuntu 16.10 + - NVIDIA GTX 1080 with driver 375.39 and CUDA 8.0 on Ubuntu 16.10 -- NVIDIA Titan X (Pascal) with driver 367.48 and CUDA 8.0 on Ubuntu - 16.04 + +- NVIDIA Titan X (Pascal) with driver 367.48 and CUDA 8.0 on Ubuntu 16.04 + - NVIDIA Tesla M40 with driver 375.39 and CUDA 7.5 on Ubuntu 16.04 Using the following hardware is discouraged: diff --git a/docs/GPU-Targets.rst b/docs/GPU-Targets.rst index 5f3d18989..213a144c1 100644 --- a/docs/GPU-Targets.rst +++ b/docs/GPU-Targets.rst @@ -293,5 +293,3 @@ Keep in mind that using the integrated graphics card is not directly possible wi .. _clinfo: https://github.com/Oblomov/clinfo .. _GPUCapsViewer: http://www.ozone3d.net/gpu_caps_viewer/ - - diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst index 8bb316038..a8eab56f6 100644 --- a/docs/GPU-Windows.rst +++ b/docs/GPU-Windows.rst @@ -583,4 +583,3 @@ And open an issue in GitHub `here`_ with that log. .. _here: https://github.com/Microsoft/LightGBM/issues .. _GPUCapsViewer: http://www.ozone3d.net/gpu_caps_viewer/ - diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 1a5a89543..791023886 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -8,27 +8,27 @@ Follow the `Quick Start <./Quick-Start.rst>`__ to know how to use LightGBM first Choose Appropriate Parallel Algorithm ------------------------------------- -LightGBM provides 2 parallel learning algorithms now. +LightGBM provides 3 parallel learning algorithms now. 
-+--------------------------+---------------------------+ -| **Parallel Algorithm** | **How to Use** | -+==========================+===========================+ -| Data parallel | ``tree_learner=data`` | -+--------------------------+---------------------------+ -| Feature parallel | ``tree_learner=feature`` | -+--------------------------+---------------------------+ -| Voting parallel | ``tree_learner=voting`` | -+--------------------------+---------------------------+ ++--------------------+---------------------------+ +| Parallel Algorithm | How to Use | ++====================+===========================+ +| Data parallel | ``tree_learner=data`` | ++--------------------+---------------------------+ +| Feature parallel | ``tree_learner=feature`` | ++--------------------+---------------------------+ +| Voting parallel | ``tree_learner=voting`` | ++--------------------+---------------------------+ These algorithms are suited for different scenarios, which is listed in the following table: -+-------------------------+----------------------+----------------------+ -| | **#data is small** | **#data is large** | -+=========================+======================+======================+ -| **#feature is small** | Feature Parallel | Data Parallel | -+-------------------------+----------------------+----------------------+ -| **#feature is large** | Feature Parallel | Voting Parallel | -+-------------------------+----------------------+----------------------+ ++-------------------------+-------------------+-----------------+ +| | #data is small | #data is large | ++=========================+===================+=================+ +| **#feature is small** | Feature Parallel | Data Parallel | ++-------------------------+-------------------+-----------------+ +| **#feature is large** | Feature Parallel | Voting Parallel | ++-------------------------+-------------------+-----------------+ More details about these parallel algorithms can be found in `optimization in parallel learning <./Features.rst#optimization-in-parallel-learning>`__. diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst index 38afbcd21..7f416ad43 100644 --- a/docs/Parameters-Tuning.rst +++ b/docs/Parameters-Tuning.rst @@ -1,7 +1,7 @@ Parameters Tuning ================= -This is a page contains all parameters in LightGBM. +This page contains parameters tuning guides for different scenarios. **List of other helpful links** diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 9d5a8a519..de83ecab7 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -1,7 +1,7 @@ Parameters ========== -This page contains all parameters in LightGBM. +This page contains descriptions of all parameters in LightGBM. **List of other helpful links** @@ -13,20 +13,11 @@ This page contains all parameters in LightGBM. - `Laurae++ Interactive Documentation`_ -**Update of 08/04/2017** - -Default values for the following parameters have changed: - -- ``min_data_in_leaf`` = 100 => 20 -- ``min_sum_hessian_in_leaf`` = 10 => 1e-3 -- ``num_leaves`` = 127 => 31 -- ``num_iterations`` = 10 => 100 - Parameters Format ----------------- The parameters format is ``key1=value1 key2=value2 ...``. -And parameters can be set both in config file and command line. +Parameters can be set both in config file and command line. By using command line, parameters should not have spaces before and after ``=``. By using config files, one line can only contain one parameter. You can use ``#`` to comment. 
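
For illustration, a config file in this format could look like the following; the parameter values here are placeholders chosen for the example, not recommended settings::

    # comments start with #
    task = train
    objective = binary
    data = train.txt
    num_leaves = 31
    learning_rate = 0.1

The same parameters could equally be passed on the command line, with no spaces around ``=``::

    ./lightgbm task=train objective=binary data=train.txt num_leaves=31 learning_rate=0.1
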
@@ -39,19 +30,19 @@ Core Parameters - path of config file - - **Note**: Only can be used in CLI version. + - **Note**: Only can be used in CLI version - ``task``, default=\ ``train``, type=enum, options=\ ``train``, ``predict``, ``convert_model``, ``refit`` - ``train``, alias=\ ``training``, for training - - ``predict``, alias=\ ``prediction``, ``test``, for prediction. + - ``predict``, alias=\ ``prediction``, ``test``, for prediction - ``convert_model``, for converting model file into if-else format, see more information in `Convert model parameters <#convert-model-parameters>`__ - - ``refit``, alias=\ ``refit_tree``, refit existing models with new data. + - ``refit``, alias=\ ``refit_tree``, refit existing models with new data - - **Note**: Only can be used in CLI version. + - **Note**: Only can be used in CLI version - ``application``, default=\ ``regression``, type=enum, options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, ``gammma``, ``tweedie``, @@ -522,20 +513,20 @@ IO Parameters - ``forced_splits``, default=\ ``""``, type=string - - path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences. + - path to a ``.json`` file that specifies splits to force at the top of every decision tree before best-first learning commences - ``.json`` file can be arbitrarily nested, and each split contains ``feature``, ``threshold`` fields, as well as ``left`` and ``right`` fields representing subsplits. Categorical splits are forced in a one-hot fashion, with ``left`` representing the split containing - the feature value and ``right`` representing other values. + the feature value and ``right`` representing other values - - see `this file `__ as an example. + - see `this file `__ as an example Objective Parameters -------------------- - ``sigmoid``, default=\ ``1.0``, type=double - - parameter for sigmoid function. Will be used in ``binary`` classification and ``lambdarank`` + - parameter for sigmoid function. Will be used in ``binary`` and ``multiclassova`` classification and in ``lambdarank`` - ``alpha``, default=\ ``0.9``, type=double @@ -699,7 +690,7 @@ GPU Parameters - ``gpu_platform_id``, default=\ ``-1``, type=int - - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform. + - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform - default value is ``-1``, means the system-wide default platform @@ -762,11 +753,10 @@ LightGBM supports weighted training. It uses an additional file to store weight It means the weight of the first data row is ``1.0``, second is ``0.5``, and so on. The weight file corresponds with data file line by line, and has per weight per line. -And if the name of data file is ``train.txt``, the weight file should be named as ``train.txt.weight`` and in the same folder as the data file. -In this case LightGBM will auto load weight file if it exists. +And if the name of data file is ``train.txt``, the weight file should be named as ``train.txt.weight`` and placed in the same folder as the data file. +In this case LightGBM will load the weight file automatically if it exists. -**update**: -You can specific weight column in data file now. Please refer to parameter ``weight`` in above. +Also, you can include weight column in your data file. Please refer to parameter ``weight`` in above. Query Data ~~~~~~~~~~ @@ -781,15 +771,14 @@ LightGBM uses an additional file to store query data, like the following: 67 ... 
-It means first ``27`` lines samples belong one query and next ``18`` lines belong to another, and so on. +It means first ``27`` lines samples belong to one query and next ``18`` lines belong to another, and so on. **Note**: data should be ordered by the query. -If the name of data file is ``train.txt``, the query file should be named as ``train.txt.query`` and in same folder of training data. +If the name of data file is ``train.txt``, the query file should be named as ``train.txt.query`` and placed in the same folder as the data file. In this case LightGBM will load the query file automatically if it exists. -**update**: -You can specific query/group id in data file now. Please refer to parameter ``group`` in above. +Also, you can include query/group id column in your data file. Please refer to parameter ``group`` in above. .. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst index aa4085111..539a37027 100644 --- a/docs/Quick-Start.rst +++ b/docs/Quick-Start.rst @@ -18,58 +18,53 @@ Follow the `Installation Guide <./Installation-Guide.rst>`__ to install LightGBM Training Data Format -------------------- -LightGBM supports input data file with `CSV`_, `TSV`_ and `LibSVM`_ formats. +LightGBM supports input data files with `CSV`_, `TSV`_ and `LibSVM`_ formats. -Label is the data of first column, and there is no header in the file. +Files could be both with and without headers. + +Label column could be specified both by index and by name. + +Some columns could be ignored. Categorical Feature Support ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -update 12/5/2016: - -LightGBM can use categorical feature directly (without one-hot coding). -The experiment on `Expo data`_ shows about 8x speed-up compared with one-hot coding. +LightGBM can use categorical features directly (without one-hot coding). +The experiment on `Expo data`_ shows about 8x speed-up compared with one-hot encoding. For the setting details, please refer to `Parameters <./Parameters.rst>`__. Weight and Query/Group Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -LightGBM also support weighted training, it needs an additional `weight data <./Parameters.rst#io-parameters>`__. +LightGBM also supports weighted training, it needs an additional `weight data <./Parameters.rst#io-parameters>`__. And it needs an additional `query data <./Parameters.rst#io-parameters>`_ for ranking task. -update 11/3/2016: - -1. support input with header now - -2. can specific label column, weight column and query/group id column. - Both index and column are supported - -3. can specific a list of ignored columns +Also, weight and query data could be specified as columns in training data in the same manner as label. Parameter Quick Look -------------------- The parameter format is ``key1=value1 key2=value2 ...``. -And parameters can be in both config file and command line. +Parameters can be set both in config file and command line. Some important parameters: -- ``config``, default=\ ``""``, type=string, alias=\ ``config_file`` +- ``config``, default=\ ``""``, type=string, alias=\ ``config_file`` - - path to config file + - path to config file - ``task``, default=\ ``train``, type=enum, options=\ ``train``, ``predict``, ``convert_model`` - ``train``, alias=\ ``training``, for training - - ``predict``, alias=\ ``prediction``, ``test``, for prediction. 
+ - ``predict``, alias=\ ``prediction``, ``test``, for prediction - ``convert_model``, for converting model file into if-else format, see more information in `Convert model parameters <./Parameters.rst#convert-model-parameters>`__ - ``application``, default=\ ``regression``, type=enum, - options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, - ``binary``, ``multiclass``, ``multiclassova``, ``xentropy``, ``xentlambda``, ``lambdarank``, ``gammma``, ``tweedie``, + options=\ ``regression``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``mape``, ``gammma``, ``tweedie``, + ``binary``, ``multiclass``, ``multiclassova``, ``xentropy``, ``xentlambda``, ``lambdarank``, alias=\ ``objective``, ``app`` - regression application @@ -88,9 +83,9 @@ Some important parameters: - ``mape``, `MAPE loss`_, alias=\ ``mean_absolute_percentage_error`` - - ``gamma``, gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed`_ + - ``gamma``, Gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed`_ - - ``tweedie``, tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed`_. + - ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed`_ - ``binary``, binary `log loss`_ classification application @@ -118,40 +113,40 @@ Some important parameters: - all values in ``label`` must be smaller than number of elements in ``label_gain`` -- ``boosting``, default=\ ``gbdt``, type=enum, - options=\ ``gbdt``, ``rf``, ``dart``, ``goss``, - alias=\ ``boost``, ``boosting_type`` +- ``boosting``, default=\ ``gbdt``, type=enum, + options=\ ``gbdt``, ``rf``, ``dart``, ``goss``, + alias=\ ``boost``, ``boosting_type`` - - ``gbdt``, traditional Gradient Boosting Decision Tree + - ``gbdt``, traditional Gradient Boosting Decision Tree - - ``rf``, Random Forest + - ``rf``, Random Forest - - ``dart``, `Dropouts meet Multiple Additive Regression Trees`_ + - ``dart``, `Dropouts meet Multiple Additive Regression Trees`_ - - ``goss``, Gradient-based One-Side Sampling + - ``goss``, Gradient-based One-Side Sampling -- ``data``, default=\ ``""``, type=string, alias=\ ``train``, ``train_data`` +- ``data``, default=\ ``""``, type=string, alias=\ ``train``, ``train_data`` - - training data, LightGBM will train from this data + - training data, LightGBM will train from this data -- ``valid``, default=\ ``""``, type=multi-string, alias=\ ``test``, ``valid_data``, ``test_data`` +- ``valid``, default=\ ``""``, type=multi-string, alias=\ ``test``, ``valid_data``, ``test_data`` - - validation/test data, LightGBM will output metrics for these data + - validation/test data, LightGBM will output metrics for these data - - support multi validation data, separate by ``,`` + - support multi validation data, separate by ``,`` -- ``num_iterations``, default=\ ``100``, type=int, - alias=\ ``num_iteration``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds``, ``num_boost_round`` +- ``num_iterations``, default=\ ``100``, type=int, + alias=\ ``num_iteration``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds``, ``num_boost_round``, ``n_estimators`` - - number of boosting iterations/trees + - number of boosting 
iterations
 
-- ``learning_rate``, default=\ ``0.1``, type=double, alias=\ ``shrinkage_rate``
+- ``learning_rate``, default=\ ``0.1``, type=double, alias=\ ``shrinkage_rate``
 
-  - shrinkage rate
+  - shrinkage rate
 
-- ``num_leaves``, default=\ ``31``, type=int, alias=\ ``num_leaf``
+- ``num_leaves``, default=\ ``31``, type=int, alias=\ ``num_leaf``
 
-  - number of leaves in one tree
+  - number of leaves in one tree
 
 - ``tree_learner``, default=\ ``serial``, type=enum, options=\ ``serial``, ``feature``, ``data``, ``voting``, alias=\ ``tree``
 
@@ -165,31 +160,31 @@ Some important parameters:
 
   - refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details
 
-- ``num_threads``, default=\ ``OpenMP_default``, type=int, alias=\ ``num_thread``, ``nthread``
+- ``num_threads``, default=\ ``OpenMP_default``, type=int, alias=\ ``num_thread``, ``nthread``
 
-  - number of threads for LightGBM
+  - number of threads for LightGBM
 
-  - for the best speed, set this to the number of **real CPU cores**,
-    not the number of threads (most CPU using `hyper-threading`_ to generate 2 threads per CPU core)
+  - for the best speed, set this to the number of **real CPU cores**,
+    not the number of threads (most CPUs use `hyper-threading`_ to generate 2 threads per CPU core)
 
-  - for parallel learning, should not use full CPU cores since this will cause poor performance for the network
+  - for parallel learning, do not use all CPU cores, since this will cause poor performance for the network
 
-- ``max_depth``, default=\ ``-1``, type=int
+- ``max_depth``, default=\ ``-1``, type=int
 
-  - limit the max depth for tree model.
-    This is used to deal with overfit when ``#data`` is small.
-    Tree still grow by leaf-wise
+  - limit the max depth for tree model.
+    This is used to deal with over-fitting when ``#data`` is small.
+    The tree still grows leaf-wise
 
-  - ``< 0`` means no limit
+  - ``< 0`` means no limit
 
-- ``min_data_in_leaf``, default=\ ``20``, type=int, alias=\ ``min_data_per_leaf`` , ``min_data``, ``min_child_samples``
+- ``min_data_in_leaf``, default=\ ``20``, type=int, alias=\ ``min_data_per_leaf``, ``min_data``, ``min_child_samples``
 
-  - minimal number of data in one leaf. Can use this to deal with over-fitting
+  - minimal number of data in one leaf. Can be used to deal with over-fitting
 
-- ``min_sum_hessian_in_leaf``, default=\ ``1e-3``, type=double,
-  alias=\ ``min_sum_hessian_per_leaf``, ``min_sum_hessian``, ``min_hessian``, ``min_child_weight``
+- ``min_sum_hessian_in_leaf``, default=\ ``1e-3``, type=double,
+  alias=\ ``min_sum_hessian_per_leaf``, ``min_sum_hessian``, ``min_hessian``, ``min_child_weight``
 
-  - minimal sum hessian in one leaf. Like ``min_data_in_leaf``, it can be used to deal with over-fitting
+  - minimal sum hessian in one leaf. Like ``min_data_in_leaf``, it can be used to deal with over-fitting
 
 For all parameters, please refer to `Parameters <./Parameters.rst>`__.
 
 Run LightGBM
@@ -208,7 +203,7 @@ For Unix:
 
     ./lightgbm config=your_config_file other_args ...
 
-Parameters can be both in the config file and command line, and the parameters in command line have higher priority than in config file.
+Parameters can be set both in config file and command line, and the parameters in command line have higher priority than in config file.
 
 For example, the following command line will keep ``num_trees=10`` and ignore the same parameter in config file. 
:: diff --git a/docs/README.rst b/docs/README.rst index 0f7843d2d..068b0aaaf 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -14,7 +14,7 @@ for Python 3.x: .. code:: sh - pip install sphinx sphinx_rtd_theme + pip install sphinx "sphinx_rtd_theme>=0.3" make html @@ -22,5 +22,5 @@ for Python 2.x: .. code:: sh - pip install mock sphinx sphinx_rtd_theme + pip install mock sphinx "sphinx_rtd_theme>=0.3" make html diff --git a/docs/_static/js/rst_links_fix.js b/docs/_static/js/rst_links_fix.js index 4ec8ec0e5..9fab7c8e2 100644 --- a/docs/_static/js/rst_links_fix.js +++ b/docs/_static/js/rst_links_fix.js @@ -1,5 +1,4 @@ $(function() { $('a[href^="./"][href*=".rst"]').attr('href', (i, val) => { return val.replace('.rst', '.html'); }); /* Replace '.rst' with '.html' in all internal links like './[Something].rst[#anchor]' */ $('.wy-nav-content').each(function () { this.style.setProperty('max-width', 'none', 'important'); }); - $('.wy-menu.wy-menu-vertical > ul:nth-of-type(2)').hide(); /* Fix theme navbar shows hidden toctree */ }); diff --git a/docs/conf.py b/docs/conf.py index 1d7d94e42..c205e924c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -115,7 +115,9 @@ html_theme = 'sphinx_rtd_theme' # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'includehidden': False, +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files,