diff --git a/.travis/test.sh b/.travis/test.sh index 40ab63bbd..1b692f044 100644 --- a/.travis/test.sh +++ b/.travis/test.sh @@ -28,13 +28,12 @@ cd $TRAVIS_BUILD_DIR if [[ ${TASK} == "check-docs" ]]; then cd docs sudo apt-get install linkchecker - pip install rstcheck # html5validator - pip install -r requirements.txt + pip install rstcheck sphinx sphinx_rtd_theme # html5validator rstcheck --report warning --ignore-directives=autoclass,autofunction `find . -type f -name "*.rst"` || exit -1 make html || exit -1 find ./_build/html/ -type f -name '*.html' -exec \ - sed -i -e 's#\(\.\/[^.]*\.\)\(md\|rst\)#\1html#g' {} \; # Emulate js function -# html5validator --root ./_build/html/ || exit -1 For future (Sphinx 1.6) usage + sed -i -e 's;\(\.\/[^.]*\.\)rst\([^[:space:]]*\);\1html\2;g' {} \; # Emulate js function +# html5validator --root ./_build/html/ || exit -1 linkchecker --config=.linkcheckerrc ./_build/html/*.html || exit -1 exit 0 fi diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index d85e6340c..6d73048b0 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -57,7 +57,7 @@ CVBooster <- R6Class( #' If early stopping occurs, the model will have 'best_iter' field #' @param callbacks list of callback functions #' List of callback functions that are applied at each iteration. -#' @param ... other parameters, see parameters.md for more informations +#' @param ... other parameters, see Parameters.rst for more informations #' #' @return a trained model \code{lgb.CVBooster}. #' diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index a1fa2d8b9..1efd677f2 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -30,7 +30,7 @@ #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets #' @param callbacks list of callback functions #' List of callback functions that are applied at each iteration. -#' @param ... other parameters, see parameters.md for more informations +#' @param ... other parameters, see Parameters.rst for more informations #' #' @return a trained booster model \code{lgb.Booster}. #' diff --git a/README.md b/README.md index 23bcfd950..a7d031c8e 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ LightGBM is a gradient boosting framework that uses tree based learning algorith - Parallel and GPU learning supported - Capable of handling large-scale data -For more details, please refer to [Features](https://github.com/Microsoft/LightGBM/blob/master/docs/Features.md). +For more details, please refer to [Features](https://github.com/Microsoft/LightGBM/blob/master/docs/Features.rst). [Comparison experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#comparison-experiment) on public datasets show that LightGBM can outperform existing boosting frameworks on both efficiency and accuracy, with significantly lower memory consumption. What's more, the [parallel experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings. @@ -36,7 +36,7 @@ News 05/03/2017 : LightGBM v2 stable release. -04/10/2017 : LightGBM supports GPU-accelerated tree learning now. Please read our [GPU Tutorial](./docs/GPU-Tutorial.md) and [Performance Comparison](./docs/GPU-Performance.rst). +04/10/2017 : LightGBM supports GPU-accelerated tree learning now. 
Please read our [GPU Tutorial](./docs/GPU-Tutorial.rst) and [Performance Comparison](./docs/GPU-Performance.rst). 02/20/2017 : Update to LightGBM v2. @@ -62,22 +62,22 @@ JPMML: https://github.com/jpmml/jpmml-lightgbm Get Started and Documentation ----------------------------- -Install by following the [guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst) for the command line program, [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) or [R-package](https://github.com/Microsoft/LightGBM/tree/master/R-package). Then please see the [Quick Start](https://github.com/Microsoft/LightGBM/blob/master/docs/Quick-Start.md) guide. +Install by following the [guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst) for the command line program, [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) or [R-package](https://github.com/Microsoft/LightGBM/tree/master/R-package). Then please see the [Quick Start](https://github.com/Microsoft/LightGBM/blob/master/docs/Quick-Start.rst) guide. Our primary documentation is at https://lightgbm.readthedocs.io/ and is generated from this repository. Next you may want to read: * [**Examples**](https://github.com/Microsoft/LightGBM/tree/master/examples) showing command line usage of common tasks -* [**Features**](https://github.com/Microsoft/LightGBM/blob/master/docs/Features.md) and algorithms supported by LightGBM -* [**Parameters**](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.md) is an exhaustive list of customization you can make -* [**Parallel Learning**](https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/Microsoft/LightGBM/blob/master/docs/GPU-Tutorial.md) can speed up computation +* [**Features**](https://github.com/Microsoft/LightGBM/blob/master/docs/Features.rst) and algorithms supported by LightGBM +* [**Parameters**](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst) is an exhaustive list of customization you can make +* [**Parallel Learning**](https://github.com/Microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/Microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation * [**Laurae++ interactive documentation**](https://sites.google.com/view/lauraepp/parameters) is a detailed guide for hyperparameters Documentation for contributors: * [**How we Update readthedocs.io**](https://github.com/Microsoft/LightGBM/blob/master/docs/README.md) -* Check out the [Development Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/development.rst). +* Check out the [Development Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Development-Guide.rst). Support ------- diff --git a/docs/Advanced-Topic.md b/docs/Advanced-Topic.md deleted file mode 100644 index 850be77fd..000000000 --- a/docs/Advanced-Topic.md +++ /dev/null @@ -1,34 +0,0 @@ -# Advanced Topics - -## Missing Value Handle - -* LightGBM enables the missing value handle by default, you can disable it by set ```use_missing=false```. -* LightGBM uses NA (NAN) to represent the missing value by default, you can change it to use zero by set ```zero_as_missing=true```. -* When ```zero_as_missing=false``` (default), the unshown value in sparse matrices (and LightSVM) is treated as zeros. 
-* When ```zero_as_missing=true```, NA and zeros (including unshown value in sparse matrices (and LightSVM)) are treated as missing. - -## Categorical Feature Support - -* LightGBM can offer a good accuracy when using native categorical features. Not like simply one-hot coding, LightGBM can find the optimal split of categorical features. Such an optimal split can provide the much better accuracy than one-hot coding solution. -* Use `categorical_feature` to specify the categorical features. Refer to the parameter `categorical_feature` in [Parameters](./Parameters.md). -* Converting to `int` type is needed first, and there is support for non-negative numbers only. It is better to convert into continues ranges. -* Use `max_cat_group`, `cat_smooth_ratio` to deal with over-fitting (when #data is small or #category is large). -* For categorical features with high cardinality (#category is large), it is better to convert it to numerical features. - -## LambdaRank - -* The label should be `int` type, and larger numbers represent the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). -* Use `label_gain` to set the gain(weight) of `int` label. -* Use `max_position` to set the NDCG optimization position. - -## Parameters Tuning - -* Refer to [Parameters Tuning](./Parameters-tuning.md). - -## GPU Support - -* Refer to [GPU Tutorial](./GPU-Tutorial.md) and [GPU Targets](./GPU-Targets.rst). - -## Parallel Learning - -* Refer to [Parallel Learning Guide](./Parallel-Learning-Guide.rst). diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst new file mode 100644 index 000000000..eebad3cdc --- /dev/null +++ b/docs/Advanced-Topics.rst @@ -0,0 +1,59 @@ +Advanced Topics +=============== + +Missing Value Handle +-------------------- + +- LightGBM enables the missing value handle by default, you can disable it by set ``use_missing=false``. + +- LightGBM uses NA (NaN) to represent the missing value by default, you can change it to use zero by set ``zero_as_missing=true``. + +- When ``zero_as_missing=false`` (default), the unshown value in sparse matrices (and LightSVM) is treated as zeros. + +- When ``zero_as_missing=true``, NA and zeros (including unshown value in sparse matrices (and LightSVM)) are treated as missing. + +Categorical Feature Support +--------------------------- + +- LightGBM can offer a good accuracy when using native categorical features. Not like simply one-hot coding, LightGBM can find the optimal split of categorical features. + Such an optimal split can provide the much better accuracy than one-hot coding solution. + +- Use ``categorical_feature`` to specify the categorical features. + Refer to the parameter ``categorical_feature`` in `Parameters <./Parameters.rst>`__. + +- Converting to ``int`` type is needed first, and there is support for non-negative numbers only. + It is better to convert into continues ranges. + +- Use ``max_cat_group``, ``cat_smooth_ratio`` to deal with over-fitting + (when ``#data`` is small or ``#category`` is large). + +- For categorical features with high cardinality (``#category`` is large), it is better to convert it to numerical features. + +LambdaRank +---------- + +- The label should be ``int`` type, and larger numbers represent the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). + +- Use ``label_gain`` to set the gain(weight) of ``int`` label. + +- Use ``max_position`` to set the NDCG optimization position. + +Parameters Tuning +----------------- + +- Refer to `Parameters Tuning <./Parameters-Tuning.rst>`__. 
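To make the ``categorical_feature`` usage described in the Categorical Feature Support section above concrete, here is a minimal Python sketch. The column names and synthetic data are hypothetical, and the over-fitting controls in the parameter dict are simply the parameters named above; their exact names can differ between LightGBM versions.

.. code:: python

    import numpy as np
    import pandas as pd
    import lightgbm as lgb

    # Synthetic data: categorical columns are already encoded as non-negative
    # integers, as recommended above (no one-hot encoding needed).
    df = pd.DataFrame({
        "price": np.random.rand(1000),
        "city": np.random.randint(0, 50, size=1000),
        "weekday": np.random.randint(0, 7, size=1000),
    })
    y = np.random.randint(0, 2, size=1000)

    # Declare the categorical columns so LightGBM searches for the optimal
    # split over category subsets instead of treating them as numeric.
    train_set = lgb.Dataset(df, label=y, categorical_feature=["city", "weekday"])

    params = {
        "objective": "binary",
        "max_cat_group": 64,       # over-fitting controls mentioned above;
        "cat_smooth_ratio": 10.0,  # availability depends on the LightGBM version
    }
    booster = lgb.train(params, train_set, num_boost_round=10)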
+ +Parallel Learning +----------------- + +- Refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__. + +GPU Support +----------- + +- Refer to `GPU Tutorial <./GPU-Tutorial.rst>`__ and `GPU Targets <./GPU-Targets.rst>`__. + +Recommendations for gcc Users (MinGW, \*nix) +-------------------------------------------- + +- Refer to `gcc Tips <./gcc-Tips.rst>`__. diff --git a/docs/development.rst b/docs/Development-Guide.rst similarity index 96% rename from docs/development.rst rename to docs/Development-Guide.rst index 880d03793..f9e5ac1c5 100644 --- a/docs/development.rst +++ b/docs/Development-Guide.rst @@ -4,7 +4,7 @@ Development Guide Algorithms ---------- -Refer to `Features <./Features.md>`__ to understand important algorithms used in LightGBM. +Refer to `Features <./Features.rst>`__ to understand important algorithms used in LightGBM. Classes and Code Structure -------------------------- @@ -68,9 +68,7 @@ Code Structure Documents API ~~~~~~~~~~~~~ -LightGBM support use `doxygen `__ to generate documents for classes and functions. - -Refer to `docs README <./README.md>`__. +Refer to `docs README <./README.rst>`__. C API ----- @@ -85,6 +83,6 @@ See the implementations at `Python-package `__. +Refer to `FAQ <./FAQ.rst>`__. Also feel free to open `issues `__ if you met problems. diff --git a/docs/FAQ.md b/docs/FAQ.md deleted file mode 100644 index c3ac9126e..000000000 --- a/docs/FAQ.md +++ /dev/null @@ -1,138 +0,0 @@ -LightGBM FAQ -============ - -### Contents - -- [Critical](#critical) -- [LightGBM](#lightgbm) -- [R-package](#r-package) -- [Python-package](#python-package) - ---- - -### Critical - -You encountered a critical issue when using LightGBM (crash, prediction error, non sense outputs...). Who should you contact? - -If your issue is not critical, just post an issue in [Microsoft/LightGBM repository](https://github.com/Microsoft/LightGBM/issues). - -If it is a critical issue, identify first what error you have: - -* Do you think it is reproducible on CLI (command line interface), R, and/or Python? -* Is it specific to a wrapper? (R or Python?) -* Is it specific to the compiler? (gcc versions? MinGW versions?) -* Is it specific to your Operating System? (Windows? Linux?) -* Are you able to reproduce this issue with a simple case? -* Are you able to (not) reproduce this issue after removing all optimization flags and compiling LightGBM in debug mode? - -Depending on the answers, while opening your issue, feel free to ping (just mention them with the arobase (@) symbol) appropriately so we can attempt to solve your problem faster: - -* [@guolinke](https://github.com/guolinke) (C++ code / R-package / Python-package) -* [@Laurae2](https://github.com/Laurae2) (R-package) -* [@wxchan](https://github.com/wxchan) (Python-package) -* [@henry0312](https://github.com/henry0312) (Python-package) -* [@StrikerRUS](https://github.com/StrikerRUS) (Python-package) -* [@huanzhang12](https://github.com/huanzhang12) (GPU support) - -Remember this is a free/open community support. We may not be available 24/7 to provide support. - ---- - -### LightGBM - -- **Question 1**: Where do I find more details about LightGBM parameters? - -- **Solution 1**: Look at [Parameters](./Parameters.md) and [Laurae++/Parameters](https://sites.google.com/view/lauraepp/parameters) website. - ---- - -- **Question 2**: On datasets with million of features, training do not start (or starts after a very long time). 
- -- **Solution 2**: Use a smaller value for `bin_construct_sample_cnt` and a larger value for `min_data`. - ---- - -- **Question 3**: When running LightGBM on a large dataset, my computer runs out of RAM. - -- **Solution 3**: Multiple solutions: set `histogram_pool_size` parameter to the MB you want to use for LightGBM (histogram_pool_size + dataset size = approximately RAM used), lower `num_leaves` or lower `max_bin` (see [Microsoft/LightGBM#562](https://github.com/Microsoft/LightGBM/issues/562)). - ---- - -- **Question 4**: I am using Windows. Should I use Visual Studio or MinGW for compiling LightGBM? - -- **Solution 4**: It is recommended to [use Visual Studio](https://github.com/Microsoft/LightGBM/issues/542) as its performance is higher for LightGBM. - ---- - -- **Question 5**: When using LightGBM GPU, I cannot reproduce results over several runs. - -- **Solution 5**: It is a normal issue, there is nothing we/you can do about, you may try to use `gpu_use_dp = true` for reproducibility (see [Microsoft/LightGBM#560](https://github.com/Microsoft/LightGBM/pull/560#issuecomment-304561654)). You may also use CPU version. - ---- - -- **Question 6**: Bagging is not reproducible when changing the number of threads. - -- **Solution 6**: As LightGBM bagging is running multithreaded, its output is dependent on the number of threads used. There is [no workaround currently](https://github.com/Microsoft/LightGBM/issues/632). - ---- - -- **Question 7**: I tried to use Random Forest mode, and LightGBM crashes! - -- **Solution 7**: It is by design. You must use `bagging_fraction` and `feature_fraction` different from 1, along with a `bagging_freq`. See [this thread](https://github.com/Microsoft/LightGBM/issues/691) as an example. - ---- - -- **Question 8**: CPU are not kept busy (like 10% CPU usage only) in Windows when using LightGBM on very large datasets with many core systems. - -- **Solution 8**: Please use [Visual Studio](https://www.visualstudio.com/downloads/) as it may be [10x faster than MinGW](https://github.com/Microsoft/LightGBM/issues/749) especially for very large trees. - ---- - -### R-package - -- **Question 1**: Any training command using LightGBM does not work after an error occurred during the training of a previous LightGBM model. - -- **Solution 1**: Run `lgb.unloader(wipe = TRUE)` in the R console, and recreate the LightGBM datasets (this will wipe all LightGBM-related variables). Due to the pointers, choosing to not wipe variables will not fix the error. This is a known issue: [Microsoft/LightGBM#698](https://github.com/Microsoft/LightGBM/issues/698). - -- **Question 2**: I used `setinfo`, tried to print my `lgb.Dataset`, and now the R console froze! - -- **Solution 2**: Avoid printing the `lgb.Dataset` after using `setinfo`. This is a known bug: [Microsoft/LightGBM#539](https://github.com/Microsoft/LightGBM/issues/539). - ---- - -### Python-package - -- **Question 1**: I see error messages like this when install from github using `python setup.py install`. - - ``` - error: Error: setup script specifies an absolute path: - - /Users/Microsoft/LightGBM/python-package/lightgbm/../../lib_lightgbm.so - - setup() arguments must *always* be /-separated paths relative to the - setup.py directory, *never* absolute paths. - ``` - -- **Solution 1**: this error should be solved in latest version. 
If you still meet this error, try to remove lightgbm.egg-info folder in your Python-package and reinstall, or check [this thread on stackoverflow](http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path). - ---- - -- **Question 2**: I see error messages like - ``` - Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset - ``` - but I already construct dataset by some code like - ``` - train = lightgbm.Dataset(X_train, y_train) - ``` - or error messages like - ``` - Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this. - ``` - -- **Solution 2**: Because LightGBM constructs bin mappers to build trees, and train and valid Datasets within one Booster share the same bin mappers, categorical features and feature names etc., the Dataset objects are constructed when construct a Booster. And if you set `free_raw_data=True` (default), the raw data (with Python data struct) will be freed. So, if you want to: - - + get label(or weight/init_score/group) before construct dataset, it's same as get `self.label` - + set label(or weight/init_score/group) before construct dataset, it's same as `self.label=some_label_array` - + get num_data(or num_feature) before construct dataset, you can get data with `self.data`, then if your data is `numpy.ndarray`, use some code like `self.data.shape` - + set predictor(or reference/categorical feature) after construct dataset, you should set `free_raw_data=False` or init a Dataset object with the same raw data diff --git a/docs/FAQ.rst b/docs/FAQ.rst new file mode 100644 index 000000000..1d45b1ecf --- /dev/null +++ b/docs/FAQ.rst @@ -0,0 +1,175 @@ +LightGBM FAQ +============ + +Contents +~~~~~~~~ + +- `Critical <#critical>`__ + +- `LightGBM <#lightgbm>`__ + +- `R-package <#r-package>`__ + +- `Python-package <#python-package>`__ + +-------------- + +Critical +~~~~~~~~ + +You encountered a critical issue when using LightGBM (crash, prediction error, non sense outputs...). Who should you contact? + +If your issue is not critical, just post an issue in `Microsoft/LightGBM repository `__. + +If it is a critical issue, identify first what error you have: + +- Do you think it is reproducible on CLI (command line interface), R, and/or Python? + +- Is it specific to a wrapper? (R or Python?) + +- Is it specific to the compiler? (gcc versions? MinGW versions?) + +- Is it specific to your Operating System? (Windows? Linux?) + +- Are you able to reproduce this issue with a simple case? + +- Are you able to (not) reproduce this issue after removing all optimization flags and compiling LightGBM in debug mode? + +Depending on the answers, while opening your issue, feel free to ping (just mention them with the arobase (@) symbol) appropriately so we can attempt to solve your problem faster: + +- `@guolinke `__ (C++ code / R-package / Python-package) +- `@Laurae2 `__ (R-package) +- `@wxchan `__ (Python-package) +- `@henry0312 `__ (Python-package) +- `@StrikerRUS `__ (Python-package) +- `@huanzhang12 `__ (GPU support) + +Remember this is a free/open community support. We may not be available 24/7 to provide support. + +-------------- + +LightGBM +~~~~~~~~ + +- **Question 1**: Where do I find more details about LightGBM parameters? + +- **Solution 1**: Take a look at `Parameters <./Parameters.rst>`__ and `Laurae++/Parameters `__ website. 
+ +-------------- + +- **Question 2**: On datasets with million of features, training do not start (or starts after a very long time). + +- **Solution 2**: Use a smaller value for ``bin_construct_sample_cnt`` and a larger value for ``min_data``. + +-------------- + +- **Question 3**: When running LightGBM on a large dataset, my computer runs out of RAM. + +- **Solution 3**: Multiple solutions: set ``histogram_pool_size`` parameter to the MB you want to use for LightGBM (histogram\_pool\_size + dataset size = approximately RAM used), + lower ``num_leaves`` or lower ``max_bin`` (see `Microsoft/LightGBM#562 `__). + +-------------- + +- **Question 4**: I am using Windows. Should I use Visual Studio or MinGW for compiling LightGBM? + +- **Solution 4**: It is recommended to `use Visual Studio `__ as its performance is higher for LightGBM. + +-------------- + +- **Question 5**: When using LightGBM GPU, I cannot reproduce results over several runs. + +- **Solution 5**: It is a normal issue, there is nothing we/you can do about, + you may try to use ``gpu_use_dp = true`` for reproducibility (see `Microsoft/LightGBM#560 `__). + You may also use CPU version. + +-------------- + +- **Question 6**: Bagging is not reproducible when changing the number of threads. + +- **Solution 6**: As LightGBM bagging is running multithreaded, its output is dependent on the number of threads used. + There is `no workaround currently `__. + +-------------- + +- **Question 7**: I tried to use Random Forest mode, and LightGBM crashes! + +- **Solution 7**: It is by design. + You must use ``bagging_fraction`` and ``feature_fraction`` different from 1, along with a ``bagging_freq``. + See `this thread `__ as an example. + +-------------- + +- **Question 8**: CPU are not kept busy (like 10% CPU usage only) in Windows when using LightGBM on very large datasets with many core systems. + +- **Solution 8**: Please use `Visual Studio `__ + as it may be `10x faster than MinGW `__ especially for very large trees. + +-------------- + +R-package +~~~~~~~~~ + +- **Question 1**: Any training command using LightGBM does not work after an error occurred during the training of a previous LightGBM model. + +- **Solution 1**: Run ``lgb.unloader(wipe = TRUE)`` in the R console, and recreate the LightGBM datasets (this will wipe all LightGBM-related variables). + Due to the pointers, choosing to not wipe variables will not fix the error. + This is a known issue: `Microsoft/LightGBM#698 `__. + +-------------- + +- **Question 2**: I used ``setinfo``, tried to print my ``lgb.Dataset``, and now the R console froze! + +- **Solution 2**: Avoid printing the ``lgb.Dataset`` after using ``setinfo``. + This is a known bug: `Microsoft/LightGBM#539 `__. + +-------------- + +Python-package +~~~~~~~~~~~~~~ + +- **Question 1**: I see error messages like this when install from GitHub using ``python setup.py install``. + + :: + + error: Error: setup script specifies an absolute path: + /Users/Microsoft/LightGBM/python-package/lightgbm/../../lib_lightgbm.so + setup() arguments must *always* be /-separated paths relative to the setup.py directory, *never* absolute paths. + +- **Solution 1**: This error should be solved in latest version. + If you still meet this error, try to remove ``lightgbm.egg-info`` folder in your Python-package and reinstall, + or check `this thread on stackoverflow `__. 
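As an illustration of the random-forest answer in the LightGBM section above (Question 7), the following sketch shows one way the required bagging parameters might be set from the Python package; the synthetic data and the chosen fractions are placeholders, not recommendations.

.. code:: python

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(500, 10)
    y = np.random.randint(0, 2, size=500)

    params = {
        "objective": "binary",
        "boosting": "rf",          # random forest mode
        "bagging_fraction": 0.8,   # must be different from 1
        "feature_fraction": 0.8,   # must be different from 1
        "bagging_freq": 1,         # bagging must actually be enabled
    }
    booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)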
+ +-------------- + +- **Question 2**: I see error messages like + + :: + + Cannot get/set label/weight/init_score/group/num_data/num_feature before construct dataset + + but I've already constructed dataset by some code like + + :: + + train = lightgbm.Dataset(X_train, y_train) + + or error messages like + + :: + + Cannot set predictor/reference/categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this. + +- **Solution 2**: Because LightGBM constructs bin mappers to build trees, and train and valid Datasets within one Booster share the same bin mappers, + categorical features and feature names etc., the Dataset objects are constructed when construct a Booster. + And if you set ``free_raw_data=True`` (default), the raw data (with Python data struct) will be freed. + So, if you want to: + + - get label(or weight/init\_score/group) before construct dataset, it's same as get ``self.label`` + + - set label(or weight/init\_score/group) before construct dataset, it's same as ``self.label=some_label_array`` + + - get num\_data(or num\_feature) before construct dataset, you can get data with ``self.data``, + then if your data is ``numpy.ndarray``, use some code like ``self.data.shape`` + + - set predictor(or reference/categorical feature) after construct dataset, + you should set ``free_raw_data=False`` or init a Dataset object with the same raw data diff --git a/docs/Features.md b/docs/Features.md deleted file mode 100644 index c5876bf0a..000000000 --- a/docs/Features.md +++ /dev/null @@ -1,183 +0,0 @@ -# Features - -This is a short introduction for the features and algorithms used in LightGBM. - -This page doesn't contain detailed algorithms, please refer to cited papers or source code if you are interested. - -## Optimization in Speed and Memory Usage - -Many boosting tools use pre-sorted based algorithms[[1, 2]](#references) (e.g. default algorithm in xgboost) for decision tree learning. It is a simple solution, but not easy to optimize. - -LightGBM uses the histogram based algorithms[[3, 4, 5]](#references), which bucketing continuous feature(attribute) values into discrete bins, to speed up training procedure and reduce memory usage. Following are advantages for histogram based algorithms: - -- **Reduce calculation cost of split gain** - - Pre-sorted based algorithms need ``O(#data)`` times calculation - - Histogram based algorithms only need to calculate ``O(#bins)`` times, and ``#bins`` is far smaller than ``#data`` - - It still needs ``O(#data)`` times to construct histogram, which only contain sum-up operation -- **Use histogram subtraction for further speed-up** - - To get one leaf's histograms in a binary tree, can use the histogram subtraction of its parent and its neighbor - - So it only need to construct histograms for one leaf (with smaller ``#data`` than its neighbor), then can get histograms of its neighbor by histogram subtraction with small cost(``O(#bins)``) -- **Reduce memory usage** - - Can replace continuous values to discrete bins. If ``#bins`` is small, can use small data type, e.g. 
uint8_t, to store training data - - No need to store additional information for pre-sorting feature values -- **Reduce communication cost for parallel learning** - -## Sparse Optimization - -- Only need ``O(2 * #non_zero_data)`` to construct histogram for sparse features - -## Optimization in Accuracy - -### Leaf-wise (Best-first) Tree Growth - -Most decision tree learning algorithms grow tree by level(depth)-wise, like the following image: - -![level_wise](./_static/images/level-wise.png) - -LightGBM grows tree by leaf-wise(best-first)[[6]](#references). It will choose the leaf with max delta loss to grow. When growing same ``#leaf``, leaf-wise algorithm can reduce more loss than level-wise algorithm. - -Leaf-wise may cause over-fitting when ``#data`` is small. So, LightGBM can use an additional parameter ``max_depth`` to limit depth of tree and avoid over-fitting (tree still grows by leaf-wise). - -![leaf_wise](./_static/images/leaf-wise.png) - -### Optimal Split for Categorical Features - -We often convert the categorical features into one-hot coding. However, it is not a good solution in tree learner. The reason is, for the high cardinality categorical features, it will grow the very unbalance tree, and needs to grow very deep to achieve the good accuracy. - -Actually, the optimal solution is partitioning the categorical feature into 2 subsets, and there are ``2^(k-1) - 1`` possible partitions. But there is a efficient solution for regression tree[[7]](#references). It needs about ``k * log(k)`` to find the optimal partition. - -The basic idea is reordering the categories according to the relevance of training target. More specifically, reordering the histogram (of categorical feature) according to it's accumulate values (``sum_gradient / sum_hessian``), then find the best split on the sorted histogram. - -## Optimization in Network Communication - -It only needs to use some collective communication algorithms, like "All reduce", "All gather" and "Reduce scatter", in parallel learning of LightGBM. LightGBM implement state-of-art algorithms[[8]](#references). These collective communication algorithms can provide much better performance than point-to-point communication. - -## Optimization in Parallel Learning - -LightGBM provides following parallel learning algorithms. - -### Feature Parallel - -#### Traditional Algorithm - -Feature parallel aims to parallel the "Find Best Split" in the decision tree. The procedure of traditional feature parallel is: - -1. Partition data vertically (different machines have different feature set) -2. Workers find local best split point {feature, threshold} on local feature set -3. Communicate local best splits with each other and get the best one -4. Worker with best split to perform split, then send the split result of data to other workers -5. Other workers split data according received data - -The shortage of traditional feature parallel: - -- Has computation overhead, since it cannot speed up "split", whose time complexity is ``O(#data)``. Thus, feature parallel cannot speed up well when ``#data`` is large. -- Need communication of split result, which cost about ``O(#data / 8)`` (one bit for one data). - -#### Feature Parallel in LightGBM - -Since feature parallel cannot speed up well when ``#data`` is large, we make a little change here: instead of partitioning data vertically, every worker holds the full data. Thus, LightGBM doesn't need to communicate for split result of data since every worker know how to split data. 
And ``#data`` won't be larger, so it is reasonable to hold full data in every machine. - -The procedure of feature parallel in LightGBM: - -1. Workers find local best split point{feature, threshold} on local feature set -2. Communicate local best splits with each other and get the best one -3. Perform best split - -However, this feature parallel algorithm still suffers from computation overhead for "split" when ``#data`` is large. So it will be better to use data parallel when ``#data`` is large. - -### Data Parallel - -#### Traditional Algorithm - -Data parallel aims to parallel the whole decision learning. The procedure of data parallel is: - -1. Partition data horizontally -2. Workers use local data to construct local histograms -3. Merge global histograms from all local histograms -4. Find best split from merged global histograms, then perform splits - -The shortage of traditional data parallel: - -- High communication cost. If using point-to-point communication algorithm, communication cost for one machine is about ``O(#machine * #feature * #bin)``. If using collective communication algorithm (e.g. "All Reduce"), communication cost is about ``O(2 * #feature * #bin)`` (check cost of "All Reduce" in chapter 4.5 at [[8]](#references)). - -#### Data Parallel in LightGBM - -We reduce communication cost of data parallel in LightGBM: - -1. Instead of "Merge global histograms from all local histograms", LightGBM use "Reduce Scatter" to merge histograms of different(non-overlapping) features for different workers. Then workers find local best split on local merged histograms and sync up global best split. -2. As aforementioned, LightGBM use histogram subtraction to speed up training. Based on this, we can communicate histograms only for one leaf, and get its neighbor's histograms by subtraction as well. - -Above all, we reduce communication cost to ``O(0.5 * #feature * #bin)`` for data parallel in LightGBM. - -### Voting Parallel - -Voting parallel further reduce the communication cost in [Data Parallel](#data-parallel) to constant cost. It uses two stage voting to reduce the communication cost of feature histograms[[9]](#references). - -## GPU Support - -Thanks [@huanzhang12](https://github.com/huanzhang12) for contributing this feature. Please read[[10]](#references) to get more details. - -- [GPU Installation](./Installation-Guide.rst) -- [GPU Tutorial](./GPU-Tutorial.md) - -## Applications and Metrics - -Support following application: - -- regression, the objective function is L2 loss -- binary classification, the objective function is logloss -- multi classification -- lambdarank, the objective function is lambdarank with NDCG - -Support following metrics: - -- L1 loss -- L2 loss -- Log loss -- Classification error rate -- AUC -- NDCG -- Multi class log loss -- Multi class error rate - -For more details, please refer to [Parameters](./Parameters.md). - -## Other Features - -- Limit ``max_depth`` of tree while grows tree leaf-wise -- [DART](https://arxiv.org/abs/1505.01866) -- L1/L2 regularization -- Bagging -- Column(feature) sub-sample -- Continued train with input GBDT model -- Continued train with the input score file -- Weighted training -- Validation metric output during training -- Multi validation data -- Multi metrics -- Early stopping (both training and prediction) -- Prediction for leaf index - -For more details, please refer to [Parameters](./Parameters.md). - -## References - -[1] Mehta, Manish, Rakesh Agrawal, and Jorma Rissanen. 
"SLIQ: A fast scalable classifier for data mining." International Conference on Extending Database Technology. Springer Berlin Heidelberg, 1996. - -[2] Shafer, John, Rakesh Agrawal, and Manish Mehta. "SPRINT: A scalable parallel classifier for data mining." Proc. 1996 Int. Conf. Very Large Data Bases. 1996. - -[3] Ranka, Sanjay, and V. Singh. "CLOUDS: A decision tree classifier for large datasets." Proceedings of the 4th Knowledge Discovery and Data Mining Conference. 1998. - -[4] Machado, F. P. "Communication and memory efficient parallel decision tree construction." (2003). - -[5] Li, Ping, Qiang Wu, and Christopher J. Burges. "Mcrank: Learning to rank using multiple classification and gradient boosting." Advances in neural information processing systems. 2007. - -[6] Shi, Haijian. "Best-first decision tree learning." Diss. The University of Waikato, 2007. - -[7] Walter D. Fisher. "[On Grouping for Maximum Homogeneity](http://amstat.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479)." Journal of the American Statistical Association. Vol. 53, No. 284 (Dec., 1958), pp. 789-798. - -[8] Thakur, Rajeev, Rolf Rabenseifner, and William Gropp. "[Optimization of collective communication operations in MPICH](http://wwwi10.lrr.in.tum.de/~gerndt/home/Teaching/HPCSeminar/mpich_multi_coll.pdf)." International Journal of High Performance Computing Applications 19.1 (2005): 49-66. - -[9] Qi Meng, Guolin Ke, Taifeng Wang, Wei Chen, Qiwei Ye, Zhi-Ming Ma, Tieyan Liu. "[A Communication-Efficient Parallel Algorithm for Decision Tree](http://papers.nips.cc/paper/6381-a-communication-efficient-parallel-algorithm-for-decision-tree)." Advances in Neural Information Processing Systems 29 (NIPS 2016). - -[10] Huan Zhang, Si Si and Cho-Jui Hsieh. "[GPU Acceleration for Large-scale Tree Boosting](https://arxiv.org/abs/1706.08359)." arXiv:1706.08359, 2017. diff --git a/docs/Features.rst b/docs/Features.rst new file mode 100644 index 000000000..525b4abce --- /dev/null +++ b/docs/Features.rst @@ -0,0 +1,273 @@ +Features +======== + +This is a short introduction for the features and algorithms used in LightGBM. + +This page doesn't contain detailed algorithms, please refer to cited papers or source code if you are interested. + +Optimization in Speed and Memory Usage +-------------------------------------- + +Many boosting tools use pre-sorted based algorithms\ `[1, 2] <#references>`__ (e.g. default algorithm in xgboost) for decision tree learning. It is a simple solution, but not easy to optimize. + +LightGBM uses the histogram based algorithms\ `[3, 4, 5] <#references>`__, which bucketing continuous feature(attribute) values into discrete bins, to speed up training procedure and reduce memory usage. 
+Following are advantages for histogram based algorithms: + +- **Reduce calculation cost of split gain** + + - Pre-sorted based algorithms need ``O(#data)`` times calculation + + - Histogram based algorithms only need to calculate ``O(#bins)`` times, and ``#bins`` is far smaller than ``#data`` + + - It still needs ``O(#data)`` times to construct histogram, which only contain sum-up operation + +- **Use histogram subtraction for further speed-up** + + - To get one leaf's histograms in a binary tree, can use the histogram subtraction of its parent and its neighbor + + - So it only need to construct histograms for one leaf (with smaller ``#data`` than its neighbor), then can get histograms of its neighbor by histogram subtraction with small cost(``O(#bins)``) +- **Reduce memory usage** + + - Can replace continuous values to discrete bins. If ``#bins`` is small, can use small data type, e.g. uint8\_t, to store training data + + - No need to store additional information for pre-sorting feature values + +- **Reduce communication cost for parallel learning** + +Sparse Optimization +------------------- + +- Only need ``O(2 * #non_zero_data)`` to construct histogram for sparse features + +Optimization in Accuracy +------------------------ + +Leaf-wise (Best-first) Tree Growth +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Most decision tree learning algorithms grow tree by level(depth)-wise, like the following image: + +.. image:: ./_static/images/level-wise.png + :align: center + +LightGBM grows tree by leaf-wise (best-first)\ `[6] <#references>`__. It will choose the leaf with max delta loss to grow. +When growing same ``#leaf``, leaf-wise algorithm can reduce more loss than level-wise algorithm. + +Leaf-wise may cause over-fitting when ``#data`` is small. +So, LightGBM can use an additional parameter ``max_depth`` to limit depth of tree and avoid over-fitting (tree still grows by leaf-wise). + +.. image:: ./_static/images/leaf-wise.png + :align: center + +Optimal Split for Categorical Features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We often convert the categorical features into one-hot coding. +However, it is not a good solution in tree learner. +The reason is, for the high cardinality categorical features, it will grow the very unbalance tree, and needs to grow very deep to achieve the good accuracy. + +Actually, the optimal solution is partitioning the categorical feature into 2 subsets, and there are ``2^(k-1) - 1`` possible partitions. +But there is a efficient solution for regression tree\ `[7] <#references>`__. It needs about ``k * log(k)`` to find the optimal partition. + +The basic idea is reordering the categories according to the relevance of training target. +More specifically, reordering the histogram (of categorical feature) according to it's accumulate values (``sum_gradient / sum_hessian``), then find the best split on the sorted histogram. + +Optimization in Network Communication +------------------------------------- + +It only needs to use some collective communication algorithms, like "All reduce", "All gather" and "Reduce scatter", in parallel learning of LightGBM. +LightGBM implement state-of-art algorithms\ `[8] <#references>`__. +These collective communication algorithms can provide much better performance than point-to-point communication. + +Optimization in Parallel Learning +--------------------------------- + +LightGBM provides following parallel learning algorithms. 
+ +Feature Parallel +~~~~~~~~~~~~~~~~ + +Traditional Algorithm +^^^^^^^^^^^^^^^^^^^^^ + +Feature parallel aims to parallel the "Find Best Split" in the decision tree. The procedure of traditional feature parallel is: + +1. Partition data vertically (different machines have different feature set) + +2. Workers find local best split point {feature, threshold} on local feature set + +3. Communicate local best splits with each other and get the best one + +4. Worker with best split to perform split, then send the split result of data to other workers + +5. Other workers split data according received data + +The shortage of traditional feature parallel: + +- Has computation overhead, since it cannot speed up "split", whose time complexity is ``O(#data)``. + Thus, feature parallel cannot speed up well when ``#data`` is large. + +- Need communication of split result, which cost about ``O(#data / 8)`` (one bit for one data). + +Feature Parallel in LightGBM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since feature parallel cannot speed up well when ``#data`` is large, we make a little change here: instead of partitioning data vertically, every worker holds the full data. +Thus, LightGBM doesn't need to communicate for split result of data since every worker know how to split data. +And ``#data`` won't be larger, so it is reasonable to hold full data in every machine. + +The procedure of feature parallel in LightGBM: + +1. Workers find local best split point {feature, threshold} on local feature set + +2. Communicate local best splits with each other and get the best one + +3. Perform best split + +However, this feature parallel algorithm still suffers from computation overhead for "split" when ``#data`` is large. +So it will be better to use data parallel when ``#data`` is large. + +Data Parallel +~~~~~~~~~~~~~ + +Traditional Algorithm +^^^^^^^^^^^^^^^^^^^^^ + +Data parallel aims to parallel the whole decision learning. The procedure of data parallel is: + +1. Partition data horizontally + +2. Workers use local data to construct local histograms + +3. Merge global histograms from all local histograms + +4. Find best split from merged global histograms, then perform splits + +The shortage of traditional data parallel: + +- High communication cost. + If using point-to-point communication algorithm, communication cost for one machine is about ``O(#machine * #feature * #bin)``. + If using collective communication algorithm (e.g. "All Reduce"), communication cost is about ``O(2 * #feature * #bin)`` (check cost of "All Reduce" in chapter 4.5 at `[8] <#references>`__). + +Data Parallel in LightGBM +^^^^^^^^^^^^^^^^^^^^^^^^^ + +We reduce communication cost of data parallel in LightGBM: + +1. Instead of "Merge global histograms from all local histograms", LightGBM use "Reduce Scatter" to merge histograms of different(non-overlapping) features for different workers. + Then workers find local best split on local merged histograms and sync up global best split. + +2. As aforementioned, LightGBM use histogram subtraction to speed up training. + Based on this, we can communicate histograms only for one leaf, and get its neighbor's histograms by subtraction as well. + +Above all, we reduce communication cost to ``O(0.5 * #feature * #bin)`` for data parallel in LightGBM. + +Voting Parallel +~~~~~~~~~~~~~~~ + +Voting parallel further reduce the communication cost in `Data Parallel <#data-parallel>`__ to constant cost. +It uses two stage voting to reduce the communication cost of feature histograms\ `[9] <#references>`__. 
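The parallel learners above are selected through the ``tree_learner`` parameter. The sketch below shows that selection from the Python package on synthetic data; it is an assumption-laden illustration only: on a single machine LightGBM falls back to serial training, and a real multi-machine run additionally needs the network setup (machine list, ports, ``num_machines``) described in the `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__.

.. code:: python

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(1000, 20)
    y = np.random.rand(1000)

    params = {
        "objective": "regression_l2",
        # "feature", "data" or "voting", matching the algorithms described above.
        # Without the distributed network parameters this runs locally.
        "tree_learner": "data",
    }
    booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10)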
+ +GPU Support +----------- + +Thanks `@huanzhang12 `__ for contributing this feature. Please read `[10] <#references>`__ to get more details. + +- `GPU Installation <./Installation-Guide.rst#build-gpu-version>`__ + +- `GPU Tutorial <./GPU-Tutorial.rst>`__ + +Applications and Metrics +------------------------ + +Support following application: + +- regression, the objective function is L2 loss + +- binary classification, the objective function is logloss + +- multi classification + +- lambdarank, the objective function is lambdarank with NDCG + +Support following metrics: + +- L1 loss + +- L2 loss + +- Log loss + +- Classification error rate + +- AUC + +- NDCG + +- Multi class log loss + +- Multi class error rate + +For more details, please refer to `Parameters <./Parameters.rst#metric-parameters>`__. + +Other Features +-------------- + +- Limit ``max_depth`` of tree while grows tree leaf-wise + +- `DART `__ + +- L1/L2 regularization + +- Bagging + +- Column(feature) sub-sample + +- Continued train with input GBDT model + +- Continued train with the input score file + +- Weighted training + +- Validation metric output during training + +- Multi validation data + +- Multi metrics + +- Early stopping (both training and prediction) + +- Prediction for leaf index + +For more details, please refer to `Parameters <./Parameters.rst>`__. + +References +---------- + +[1] Mehta, Manish, Rakesh Agrawal, and Jorma Rissanen. "SLIQ: A fast scalable classifier for data mining." International Conference on Extending Database Technology. Springer Berlin Heidelberg, 1996. + +[2] Shafer, John, Rakesh Agrawal, and Manish Mehta. "SPRINT: A scalable parallel classifier for data mining." Proc. 1996 Int. Conf. Very Large Data Bases. 1996. + +[3] Ranka, Sanjay, and V. Singh. "CLOUDS: A decision tree classifier for large datasets." Proceedings of the 4th Knowledge Discovery and Data Mining Conference. 1998. + +[4] Machado, F. P. "Communication and memory efficient parallel decision tree construction." (2003). + +[5] Li, Ping, Qiang Wu, and Christopher J. Burges. "Mcrank: Learning to rank using multiple classification and gradient boosting." Advances in neural information processing systems. 2007. + +[6] Shi, Haijian. "Best-first decision tree learning." Diss. The University of Waikato, 2007. + +[7] Walter D. Fisher. "`On Grouping for Maximum Homogeneity`_." Journal of the American Statistical Association. Vol. 53, No. 284 (Dec., 1958), pp. 789-798. + +[8] Thakur, Rajeev, Rolf Rabenseifner, and William Gropp. "`Optimization of collective communication operations in MPICH`_." International Journal of High Performance Computing Applications 19.1 (2005): 49-66. + +[9] Qi Meng, Guolin Ke, Taifeng Wang, Wei Chen, Qiwei Ye, Zhi-Ming Ma, Tieyan Liu. "`A Communication-Efficient Parallel Algorithm for Decision Tree`_." Advances in Neural Information Processing Systems 29 (NIPS 2016). + +[10] Huan Zhang, Si Si and Cho-Jui Hsieh. "`GPU Acceleration for Large-scale Tree Boosting`_." arXiv:1706.08359, 2017. + +.. _On Grouping for Maximum Homogeneity: http://amstat.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479 + +.. _Optimization of collective communication operations in MPICH: http://wwwi10.lrr.in.tum.de/~gerndt/home/Teaching/HPCSeminar/mpich_multi_coll.pdf + +.. _A Communication-Efficient Parallel Algorithm for Decision Tree: http://papers.nips.cc/paper/6381-a-communication-efficient-parallel-algorithm-for-decision-tree + +.. 
_GPU Acceleration for Large-scale Tree Boosting: https://arxiv.org/abs/1706.08359 diff --git a/docs/GPU-Performance.rst b/docs/GPU-Performance.rst index 0fc252cc4..8df1ea366 100644 --- a/docs/GPU-Performance.rst +++ b/docs/GPU-Performance.rst @@ -161,7 +161,8 @@ For most datasets, using 63 bins is sufficient. We record the wall clock time after 500 iterations, as shown in the figure below: -|Performance Comparison| +.. image:: ./_static/images/gpu-performance-comparison.png + :align: center When using a GPU, it is advisable to use a bin size of 63 rather than 255, because it can speed up training significantly without noticeably affecting accuracy. On CPU, using a smaller bin size only marginally improves performance, sometimes even slows down training, @@ -206,6 +207,4 @@ Huan Zhang, Si Si and Cho-Jui Hsieh. `GPU Acceleration for Large-scale Tree Boos .. _0bb4a82: https://github.com/Microsoft/LightGBM/commit/0bb4a82 -.. |Performance Comparison| image:: ./_static/images/gpu-performance-comparison.png - .. _GPU Acceleration for Large-scale Tree Boosting: https://arxiv.org/abs/1706.08359 diff --git a/docs/GPU-Targets.rst b/docs/GPU-Targets.rst index 4540dcd06..4448fe6b1 100644 --- a/docs/GPU-Targets.rst +++ b/docs/GPU-Targets.rst @@ -1,9 +1,8 @@ GPU Targets Table ================= -When using OpenCL SDKs, targeting CPU and GPU at the same time is -sometimes possible. This is especially true for Intel OpenCL SDK and AMD -APP SDK. +When using OpenCL SDKs, targeting CPU and GPU at the same time is sometimes possible. +This is especially true for Intel OpenCL SDK and AMD APP SDK. You can find below a table of correspondence: @@ -22,8 +21,7 @@ Legend: - \* Not usable directly. - \*\* Reported as unsupported in public forums. -AMD GPUs using Intel SDK for OpenCL is not a typo, nor AMD APP SDK -compatibility with CPUs. +AMD GPUs using Intel SDK for OpenCL is not a typo, nor AMD APP SDK compatibility with CPUs. -------------- @@ -36,8 +34,7 @@ We present the following scenarii: - Single CPU and GPU (even with integrated graphics) - Multiple CPU/GPU -We provide test R code below, but you can use the language of your -choice with the examples of your choices: +We provide test R code below, but you can use the language of your choice with the examples of your choices: .. code:: r @@ -73,15 +70,13 @@ Using a bad ``gpu_device_id`` is not critical, as it will fallback to: - ``gpu_device_id = 0`` if using ``gpu_platform_id = 0`` - ``gpu_device_id = 1`` if using ``gpu_platform_id = 1`` -However, using a bad combination of ``gpu_platform_id`` and -``gpu_device_id`` will lead to a **crash** (you will lose your entire -session content). Beware of it. +However, using a bad combination of ``gpu_platform_id`` and ``gpu_device_id`` will lead to a **crash** (you will lose your entire session content). +Beware of it. CPU Only Architectures ---------------------- -When you have a single device (one CPU), OpenCL usage is -straightforward: ``gpu_platform_id = 0``, ``gpu_device_id = 0`` +When you have a single device (one CPU), OpenCL usage is straightforward: ``gpu_platform_id = 0``, ``gpu_device_id = 0`` This will use the CPU with OpenCL, even though it says it says GPU. @@ -124,18 +119,15 @@ Example: Single CPU and GPU (even with integrated graphics) -------------------------------------------------- -If you have integrated graphics card (Intel HD Graphics) and a dedicated -graphics card (AMD, NVIDIA), the dedicated graphics card will -automatically override the integrated graphics card. 
The workaround is -to disable your dedicated graphics card to be able to use your -integrated graphics card. +If you have integrated graphics card (Intel HD Graphics) and a dedicated graphics card (AMD, NVIDIA), +the dedicated graphics card will automatically override the integrated graphics card. +The workaround is to disable your dedicated graphics card to be able to use your integrated graphics card. -When you have multiple devices (one CPU and one GPU), the order is -usually the following: +When you have multiple devices (one CPU and one GPU), the order is usually the following: + +- GPU: ``gpu_platform_id = 0``, ``gpu_device_id = 0``, + sometimes it is usable using ``gpu_platform_id = 1``, ``gpu_device_id = 1`` but at your own risk! -- GPU: ``gpu_platform_id = 0``, ``gpu_device_id = 0``, sometimes it is - usable using ``gpu_platform_id = 1``, ``gpu_device_id = 1`` but at - your own risk! - CPU: ``gpu_platform_id = 0``, ``gpu_device_id = 1`` Example of GPU (``gpu_platform_id = 0``, ``gpu_device_id = 0``): @@ -209,8 +201,7 @@ Example of CPU (``gpu_platform_id = 0``, ``gpu_device_id = 1``): [LightGBM] [Info] Trained a tree with leaves=7 and max_depth=5 [2]: test's rmse:0 -When using a wrong ``gpu_device_id``, it will automatically fallback to -``gpu_device_id = 0``: +When using a wrong ``gpu_device_id``, it will automatically fallback to ``gpu_device_id = 0``: .. code:: r @@ -245,8 +236,7 @@ When using a wrong ``gpu_device_id``, it will automatically fallback to [LightGBM] [Info] Trained a tree with leaves=7 and max_depth=5 [2]: test's rmse:0 -Do not ever run under the following scenario as it is known to crash -even if it says it is using the CPU because it is NOT the case: +Do not ever run under the following scenario as it is known to crash even if it says it is using the CPU because it is NOT the case: - One CPU and one GPU - ``gpu_platform_id = 1``, ``gpu_device_id = 0`` @@ -284,13 +274,12 @@ even if it says it is using the CPU because it is NOT the case: Multiple CPU and GPU -------------------- -If you have multiple devices (multiple CPUs and multiple GPUs), you will -have to test different ``gpu_device_id`` and different -``gpu_platform_id`` values to find out the values which suits the -CPU/GPU you want to use. Keep in mind that using the integrated graphics -card is not directly possible without disabling every dedicated graphics -card. +If you have multiple devices (multiple CPUs and multiple GPUs), +you will have to test different ``gpu_device_id`` and different ``gpu_platform_id`` values to find out the values which suits the CPU/GPU you want to use. +Keep in mind that using the integrated graphics card is not directly possible without disabling every dedicated graphics card. .. _Intel SDK for OpenCL: https://software.intel.com/en-us/articles/opencl-drivers + .. _AMD APP SDK: http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/ + .. _NVIDIA CUDA Toolkit: https://developer.nvidia.com/cuda-downloads diff --git a/docs/GPU-Tutorial.md b/docs/GPU-Tutorial.md deleted file mode 100644 index 72652c73e..000000000 --- a/docs/GPU-Tutorial.md +++ /dev/null @@ -1,179 +0,0 @@ -LightGBM GPU Tutorial -===================== - -The purpose of this document is to give you a quick step-by-step tutorial on GPU training. - -For Windows, please see [GPU Windows Tutorial](./GPU-Windows.md). - -We will use the GPU instance on [Microsoft Azure cloud computing platform](https://azure.microsoft.com/) for demonstration, but you can use any machine with modern AMD or NVIDIA GPUs. 
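For readers using the Python package instead of the R snippets above, a rough equivalent is sketched below. It assumes LightGBM was built with GPU support, uses synthetic data, and the platform/device ids must match whatever your OpenCL installation exposes; as noted above, a bad combination can crash the session.

```
import numpy as np
import lightgbm as lgb

X = np.random.rand(5000, 28)
y = np.random.randint(0, 2, size=5000)

params = {
    "objective": "binary",
    "device": "gpu",        # requires a GPU-enabled build
    "gpu_platform_id": 0,   # OpenCL platform, see the table above
    "gpu_device_id": 0,     # device on that platform
    "max_bin": 63,          # smaller bin sizes are usually enough on GPU
}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)
```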
- -GPU Setup ---------- - -You need to launch a `NV` type instance on Azure (available in East US, North Central US, South Central US, West Europe and Southeast Asia zones) and select Ubuntu 16.04 LTS as the operating system. - -For testing, the smallest `NV6` type virtual machine is sufficient, which includes 1/2 M60 GPU, with 8 GB memory, 180 GB/s memory bandwidth and 4,825 GFLOPS peak computation power. Don't use the `NC` type instance as the GPUs (K80) are based on an older architecture (Kepler). - -First we need to install minimal NVIDIA drivers and OpenCL development environment: - -``` -sudo apt-get update -sudo apt-get install --no-install-recommends nvidia-375 -sudo apt-get install --no-install-recommends nvidia-opencl-icd-375 nvidia-opencl-dev opencl-headers -``` - -After installing the drivers you need to restart the server. - -``` -sudo init 6 -``` - -After about 30 seconds, the server should be up again. - -If you are using a AMD GPU, you should download and install the [AMDGPU-Pro](http://support.amd.com/en-us/download/linux) driver and also install package `ocl-icd-libopencl1` and `ocl-icd-opencl-dev`. - -Build LightGBM --------------- - -Now install necessary building tools and dependencies: - -``` -sudo apt-get install --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev -``` - -The NV6 GPU instance has a 320 GB ultra-fast SSD mounted at /mnt. Let's use it as our workspace (skip this if you are using your own machine): - -``` -sudo mkdir -p /mnt/workspace -sudo chown $(whoami):$(whoami) /mnt/workspace -cd /mnt/workspace -``` - -Now we are ready to checkout LightGBM and compile it with GPU support: - -``` -git clone --recursive https://github.com/Microsoft/LightGBM -cd LightGBM -mkdir build ; cd build -cmake -DUSE_GPU=1 .. -make -j$(nproc) -cd .. -``` - -You will see two binaries are generated, `lightgbm` and `lib_lightgbm.so`. - -If you are building on OSX, you probably need to remove macro `BOOST_COMPUTE_USE_OFFLINE_CACHE` in `src/treelearner/gpu_tree_learner.h` to avoid a known crash bug in Boost.Compute. - -Install Python Interface (optional) ------------------------------------ - -If you want to use the Python interface of LightGBM, you can install it now (along with some necessary Python-package dependencies): - -``` -sudo apt-get -y install python-pip -sudo -H pip install setuptools numpy scipy scikit-learn -U -cd python-package/ -sudo python setup.py install -cd .. -``` - -You need to set an additional parameter `"device" : "gpu"` (along with your other options like `learning_rate`, `num_leaves`, etc) to use GPU in Python. - -You can read our [Python Guide](https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide) for more information on how to use the Python interface. - -Dataset Preparation -------------------- - -Using the following commands to prepare the Higgs dataset: - -``` -git clone https://github.com/guolinke/boosting_tree_benchmarks.git -cd boosting_tree_benchmarks/data -wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" -gunzip HIGGS.csv.gz -python higgs2libsvm.py -cd ../.. -ln -s boosting_tree_benchmarks/data/higgs.train -ln -s boosting_tree_benchmarks/data/higgs.test -``` - -Now we create a configuration file for LightGBM by running the following commands (please copy the entire block and run it as a whole): - -``` -cat > lightgbm_gpu.conf <> lightgbm_gpu.conf -``` - -GPU is enabled in the configuration file we just created by setting `device=gpu`. 
It will use the first GPU installed on the system by default (`gpu_platform_id=0` and `gpu_device_id=0`). - -Run Your First Learning Task on GPU ------------------------------------ - -Now we are ready to start GPU training! First we want to verify the GPU works correctly. Run the following command to train on GPU, and take a note of the AUC after 50 iterations: - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc -``` - -Now train the same dataset on CPU using the following command. You should observe a similar AUC: - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc device=cpu -``` - -Now we can make a speed test on GPU without calculating AUC after each iteration. - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=binary metric=auc -``` - -Speed test on CPU: - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=binary metric=auc device=cpu -``` - -You should observe over three times speedup on this GPU. - -The GPU acceleration can be used on other tasks/metrics (regression, multi-class classification, ranking, etc) as well. For example, we can train the Higgs dataset on GPU as a regression task: - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=regression_l2 metric=l2 -``` - -Also, you can compare the training speed with CPU: - -``` -./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=regression_l2 metric=l2 device=cpu -``` - -Further Reading ---------------- - -[GPU Tuning Guide and Performance Comparison](./GPU-Performance.rst) - -[GPU SDK Correspondence and Device Targeting Table](./GPU-Targets.rst) - -[GPU Windows Tutorial](./GPU-Windows.md) - -Reference ---------- - -Please kindly cite the following article in your publications if you find the GPU acceleration useful: - -Huan Zhang, Si Si and Cho-Jui Hsieh. [GPU Acceleration for Large-scale Tree Boosting](https://arxiv.org/abs/1706.08359). arXiv:1706.08359, 2017. diff --git a/docs/GPU-Tutorial.rst b/docs/GPU-Tutorial.rst new file mode 100644 index 000000000..c30307c76 --- /dev/null +++ b/docs/GPU-Tutorial.rst @@ -0,0 +1,196 @@ +LightGBM GPU Tutorial +===================== + +The purpose of this document is to give you a quick step-by-step tutorial on GPU training. + +For Windows, please see `GPU Windows Tutorial <./GPU-Windows.rst>`__. + +We will use the GPU instance on `Microsoft Azure cloud computing platform`_ for demonstration, +but you can use any machine with modern AMD or NVIDIA GPUs. + +GPU Setup +--------- + +You need to launch a ``NV`` type instance on Azure (available in East US, North Central US, South Central US, West Europe and Southeast Asia zones) +and select Ubuntu 16.04 LTS as the operating system. + +For testing, the smallest ``NV6`` type virtual machine is sufficient, which includes 1/2 M60 GPU, with 8 GB memory, 180 GB/s memory bandwidth and 4,825 GFLOPS peak computation power. +Don't use the ``NC`` type instance as the GPUs (K80) are based on an older architecture (Kepler). + +First we need to install minimal NVIDIA drivers and OpenCL development environment: + +:: + + sudo apt-get update + sudo apt-get install --no-install-recommends nvidia-375 + sudo apt-get install --no-install-recommends nvidia-opencl-icd-375 nvidia-opencl-dev opencl-headers + +After installing the drivers you need to restart the server. + +:: + + sudo init 6 + +After about 30 seconds, the server should be up again. 
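Before building, it can help to confirm that the driver and OpenCL runtime are actually visible. This optional check is not part of the tutorial; it assumes the third-party ``pyopencl`` package (``pip install pyopencl``), and the indices it prints correspond roughly to the ``gpu_platform_id``/``gpu_device_id`` values used later.

.. code:: python

    # Optional sanity check (assumption: pyopencl is installed): list the OpenCL
    # platforms and devices that the freshly installed driver exposes.
    import pyopencl as cl

    for platform_id, platform in enumerate(cl.get_platforms()):
        print("Platform %d: %s" % (platform_id, platform.name))
        for device_id, device in enumerate(platform.get_devices()):
            print("  Device %d: %s (%s)" % (device_id, device.name,
                                            cl.device_type.to_string(device.type)))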
+ +If you are using an AMD GPU, you should download and install the `AMDGPU-Pro`_ driver and also install package ``ocl-icd-libopencl1`` and ``ocl-icd-opencl-dev``. + +Build LightGBM +-------------- + +Now install necessary building tools and dependencies: + +:: + + sudo apt-get install --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev + +The ``NV6`` GPU instance has a 320 GB ultra-fast SSD mounted at ``/mnt``. +Let's use it as our workspace (skip this if you are using your own machine): + +:: + + sudo mkdir -p /mnt/workspace + sudo chown $(whoami):$(whoami) /mnt/workspace + cd /mnt/workspace + +Now we are ready to check out LightGBM and compile it with GPU support: + +:: + + git clone --recursive https://github.com/Microsoft/LightGBM + cd LightGBM + mkdir build ; cd build + cmake -DUSE_GPU=1 .. + make -j$(nproc) + cd .. + +You will see two binaries are generated, ``lightgbm`` and ``lib_lightgbm.so``. + +If you are building on OSX, you probably need to remove macro ``BOOST_COMPUTE_USE_OFFLINE_CACHE`` in ``src/treelearner/gpu_tree_learner.h`` to avoid a known crash bug in Boost.Compute. + +Install Python Interface (optional) +----------------------------------- + +If you want to use the Python interface of LightGBM, you can install it now (along with some necessary Python-package dependencies): + +:: + + sudo apt-get -y install python-pip + sudo -H pip install setuptools numpy scipy scikit-learn -U + cd python-package/ + sudo python setup.py install --precompile + cd .. + +You need to set an additional parameter ``"device" : "gpu"`` (along with your other options like ``learning_rate``, ``num_leaves``, etc) to use GPU in Python. + +You can read our `Python Package Examples`_ for more information on how to use the Python interface. + +Dataset Preparation +------------------- + +Use the following commands to prepare the Higgs dataset: + +:: + + git clone https://github.com/guolinke/boosting_tree_benchmarks.git + cd boosting_tree_benchmarks/data + wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + gunzip HIGGS.csv.gz + python higgs2libsvm.py + cd ../.. + ln -s boosting_tree_benchmarks/data/higgs.train + ln -s boosting_tree_benchmarks/data/higgs.test + +Now we create a configuration file for LightGBM by running the following commands (please copy the entire block and run it as a whole): + +:: + + cat > lightgbm_gpu.conf <> lightgbm_gpu.conf + +GPU is enabled in the configuration file we just created by setting ``device=gpu``. +It will use the first GPU installed on the system by default (``gpu_platform_id=0`` and ``gpu_device_id=0``). + +Run Your First Learning Task on GPU +----------------------------------- + +Now we are ready to start GPU training! + +First we want to verify the GPU works correctly. +Run the following command to train on GPU, and take a note of the AUC after 50 iterations: + +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc + +Now train the same dataset on CPU using the following command. You should observe a similar AUC: + +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc device=cpu + +Now we can make a speed test on GPU without calculating AUC after each iteration.
+ +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=binary metric=auc + +Speed test on CPU: + +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=binary metric=auc device=cpu + +You should observe over three times speedup on this GPU. + +The GPU acceleration can be used on other tasks/metrics (regression, multi-class classification, ranking, etc) as well. +For example, we can train the Higgs dataset on GPU as a regression task: + +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=regression_l2 metric=l2 + +Also, you can compare the training speed with CPU: + +:: + + ./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=regression_l2 metric=l2 device=cpu + +Further Reading +--------------- + +- `GPU Tuning Guide and Performance Comparison <./GPU-Performance.rst>`__ + +- `GPU SDK Correspondence and Device Targeting Table <./GPU-Targets.rst>`__ + +- `GPU Windows Tutorial <./GPU-Windows.rst>`__ + +Reference +--------- + +Please kindly cite the following article in your publications if you find the GPU acceleration useful: + +Huan Zhang, Si Si and Cho-Jui Hsieh. "`GPU Acceleration for Large-scale Tree Boosting`_." arXiv:1706.08359, 2017. + +.. _Microsoft Azure cloud computing platform: https://azure.microsoft.com/ + +.. _AMDGPU-Pro: http://support.amd.com/en-us/download/linux + +.. _Python Package Examples: https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide + +.. _GPU Acceleration for Large-scale Tree Boosting: https://arxiv.org/abs/1706.08359 diff --git a/docs/GPU-Windows.md b/docs/GPU-Windows.md deleted file mode 100644 index 29db5ad3d..000000000 --- a/docs/GPU-Windows.md +++ /dev/null @@ -1,450 +0,0 @@ -GPU Windows Compilation -======================= - -This guide is for the MinGW build. - -For the MSVC (Visual Studio) build with GPU, please refer to [Installation Guide](./Installation-Guide.rst). (We recommend you to use this since it is much easier). - -# Install LightGBM GPU version in Windows (CLI / R / Python), using MinGW/gcc - -This is for a vanilla installation of Boost, including full compilation steps from source without precompiled libraries. - -Installation steps (depends on what you are going to do): - -* Install the appropriate OpenCL SDK -* Install MinGW -* Install Boost -* Install Git -* Install CMake -* Create LightGBM binaries -* Debugging LightGBM in CLI (if GPU is crashing or any other crash reason) - -If you wish to use another compiler like Visual Studio C++ compiler, you need to adapt the steps to your needs. - -For this compilation tutorial, I am using AMD SDK for our OpenCL steps. However, you are free to use any OpenCL SDK you want, you just need to adjust the PATH correctly. - -You will also need administrator rights. This will not work without them. - -At the end, you can restore your original PATH. - ---- - -## Modifying PATH (for newbies) - -To modify PATH, just follow the pictures after going to the `Control Panel`: - -![System](./_static/images/screenshot-system.png) - -Then, go to `Advanced` > `Environment Variables...`: - -![Advanced System Settings](./_static/images/screenshot-advanced-system-settings.png) - -Under `System variables`, the variable `Path`: - -![Environment Variables](./_static/images/screenshot-environment-variables.png) - ---- - -### Antivirus Performance Impact - -Does not apply to you if you do not use a third-party antivirus nor the default preinstalled antivirus on Windows. 
- -**Windows Defender or any other antivirus will have a significant impact on the speed you will be able to perform the steps.** It is recommended to **turn them off temporarily** until you finished with building and setting up everything, then turn them back on, if you are using them. - ---- - -## OpenCL SDK Installation - -Installing the appropriate OpenCL SDK requires you to download the correct vendor source SDK. You need to know on what you are going to use LightGBM!: - -* For running on Intel, get [Intel SDK for OpenCL](https://software.intel.com/en-us/articles/opencl-drivers) (NOT RECOMMENDED) -* For running on AMD, get [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) -* For running on NVIDIA, get [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) - -Further reading and correspondnce table (especially if you intend to use cross-platform devices, like Intel CPU with AMD APP SDK): [GPU SDK Correspondence and Device Targeting Table](./GPU-Targets.rst). - -Warning: using Intel OpenCL is not recommended and may crash your machine due to being non compliant to OpenCL standards. If your objective is to use LightGBM + OpenCL on CPU, please use AMD APP SDK instead (it can run also on Intel CPUs without any issues). - ---- - -## MinGW Correct Compiler Selection - -If you are expecting to use LightGBM without R, you need to install MinGW. Installing MinGW is straightforward, download [this](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe). - -Make sure you are using the x86_64 architecture, and do not modify anything else. You may choose a version other than the most recent one if you need a previous MinGW version. - -![MinGW installation](./_static/images/screenshot-mingw-installation.png) - -Then, add to your PATH the following (to adjust to your MinGW version): - -``` -C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin -``` - -**Warning: R users (even if you do not want LightGBM for R)** - -**If you have RTools and MinGW installed, and wish to use LightGBM in R, get rid of MinGW from PATH (to keep: `c:\Rtools\bin;c:\Rtools\mingw_32\bin` for 32-bit R installation, `c:\Rtools\bin;c:\Rtools\mingw_64\bin` for 64-bit R installation).** - -You can check which MinGW version you are using by running the following in a command prompt: `gcc -v`: - -![R MinGW used](./_static/images/screenshot-r-mingw-used.png) - -To check whether you need 32-bit or 64-bit MinGW for R, install LightGBM as usual and check for the following: - -```r -* installing *source* package 'lightgbm' ... -** libs -c:/Rtools/mingw_64/bin/g++ -``` - -If it says `mingw_64` then you need the 64-bit version (PATH with `c:\Rtools\bin;c:\Rtools\mingw_64\bin`), otherwise you need the 32-bit version (`c:\Rtools\bin;c:\Rtools\mingw_32\bin`), the latter being a very rare and untested case. - -Quick installation of LightGBM can be done using: - -```r -devtools::install_github("Microsoft/LightGBM", subdir = "R-package") -``` - ---- - -## Boost Compilation - -Installing Boost requires to download Boost and to install it. It takes about 10 minutes to several hours depending on your CPU speed and network speed. - -We will assume an installation in `C:\boost` and a general installation (like in Unix variants: without versioning and without type tags). - -There is one mandatory step to check: the compiler. 
- -* **Warning: if you want the R installation**: If you have already MinGW in your PATH variable, get rid of it (you will link to the wrong compiler otherwise). -* **Warning: if you want the CLI installation**: if you have already Rtools in your PATH variable, get rid of it (you will link to the wrong compiler otherwise). - -* R installation must have Rtools in PATH -* CLI / Python installation must have MinGW (not Rtools) in PATH - -In addition, assuming you are going to use `C:\boost` for the folder path, you should add now already the following to PATH: `C:\boost\boost-build\bin;C:\boost\boost-build\include\boost`. Adjust `C:\boost` if you install it elsewhere. - -We can now start downloading and compiling the required Boost libraries: - -* Download Boost here: http://www.boost.org/users/history/version_1_63_0.html (boost_1_63_0.zip). -* Extract the archive to `C:\boost`. -* Open a command prompt, and run `cd C:\boost\boost_1_63_0\tools\build`. -* In command prompt, run `bootstrap.bat gcc`. -* In command prompt, run `b2 install --prefix="C:\boost\boost-build" toolset=gcc`. -* In command prompt, run `cd C:\boost\boost_1_63_0`. - -To build the Boost libraries, you have two choices for command prompt: - -* If you have only one single core, you can use the default - ``` - b2 install --build_dir="C:\boost\boost-build" --prefix="C:\boost\boost-build" toolset=gcc --with=filesystem,system threading=multi --layout=system release - ``` -* If you want to do a multithreaded library building (faster), add `-j N` by replacing N by the number of cores/threads you have. For instance, for 2 cores, you would do - ``` - b2 install --build_dir="C:\boost\boost-build" --prefix="C:\boost\boost-build" toolset=gcc --with=filesystem,system threading=multi --layout=system release -j 2 - ``` - -Ignore all the errors popping up, like Python, etc., they do not matter for us. - -Your folder should look like this at the end (not fully detailed): - -``` -- C - |--- boost - |------ boost_1_63_0 - |--------- some folders and files - |------ boost-build - |--------- bin - |--------- include - |------------ boost - |--------- lib - |--------- share -``` - -This is what you should (approximately) get at the end of Boost compilation: - -![Boost compiled](./_static/images/screenshot-boost-compiled.png) - -If you are getting an error: - -* Wipe your boost directory -* Close the command prompt -* Make sure you added `C:\boost\boost-build\bin;C:\boost\boost-build\include\boost` to your PATH (adjust accordingly if you use another folder) -* Do the boost compilation steps again (extract => command prompt => `cd` => `bootstrap` => `b2` => `cd` => `b2` - ---- - -## Git Installation - -Installing Git for Windows is straightforward, use the following [link](https://git-for-windows.github.io/). - -![git for Windows](./_static/images/screenshot-git-for-windows.png) - -Then, click on the big Download button, you can't miss it. - -Now, we can fetch LightGBM repository for GitHub. Run Git Bash and the following command: - -``` -cd C:/ -mkdir github_repos -cd github_repos -git clone --recursive https://github.com/Microsoft/LightGBM -``` - -Your LightGBM repository copy should now be under `C:\github_repos\LightGBM`. You are free to use any folder you want, but you have to adapt. - -Keep Git Bash open. 
- ---- - -## CMake Installation, Configuration, Generation - -**CLI / Python users only** - -Installing CMake requires one download first and then a lot of configuration for LightGBM: - -![Downloading CMake](./_static/images/screenshot-downloading-cmake.png) - -* Download CMake 3.8.0 here: https://cmake.org/download/. -* Install CMake. -* Run cmake-gui. -* Select the folder where you put LightGBM for `Where is the source code`, default using our steps would be `C:/github_repos/LightGBM`. -* Copy the folder name, and add `/build` for "Where to build the binaries", default using our steps would be `C:/github_repos/LightGBM/build`. -* Click `Configure`. - -![Create directory](./_static/images/screenshot-create-directory.png) - -![MinGW makefiles to use](./_static/images/screenshot-mingw-makefiles-to-use.png) - -* Lookup for `USE_GPU` and check the checkbox - -![Use GPU](./_static/images/screenshot-use-gpu.png) - -* Click `Configure` - -You should get (approximately) the following after clicking Configure: - -![Configured LightGBM](./_static/images/screenshot-configured-lightgbm.png) - -``` -Looking for CL_VERSION_2_0 -Looking for CL_VERSION_2_0 - found -Found OpenCL: C:/Windows/System32/OpenCL.dll (found version "2.0") -OpenCL include directory:C:/Program Files (x86)/AMD APP SDK/3.0/include -Boost version: 1.63.0 -Found the following Boost libraries: - filesystem - system -Configuring done -``` - -* Click `Generate` to get the following message: - -``` -Generating done -``` - -This is straightforward, as CMake is providing a large help into locating the correct elements. - ---- - -## LightGBM Compilation (CLI: final step) - -### Installation in CLI - -**CLI / Python users** - -Creating LightGBM libraries is very simple as all the important and hard steps were done before. - -You can do everything in the Git Bash console you left open: - -* If you closed Git Bash console previously, run this to get back to the build folder: `cd C:/github_repos/LightGBM/build` -* If you did not close the Git Bash console previously, run this to get to the build folder: `cd LightGBM/build` -* Setup MinGW as make using `alias make='mingw32-make'` (otherwise, beware error and name clash!). -* In Git Bash, run `make` and see LightGBM being installing! - -![LightGBM with GPU support compiled](./_static/images/screenshot-lightgbm-with-gpu-support-compiled.png) - -If everything was done correctly, you now compiled CLI LightGBM with GPU support! - -### Testing in CLI - -You can now test LightGBM directly in CLI in a **command prompt** (not Git Bash): - -``` -cd C:/github_repos/LightGBM/examples/binary_classification -"../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu -``` - -![LightGBM in CLI with GPU](./_static/images/screenshot-lightgbm-in-cli-with-gpu.png) - -Congratulations for reaching this stage! - -To learn how to target a correct CPU or GPU for training, please see: [GPU SDK Correspondence and Device Targeting Table](./GPU-Targets.rst). - ---- - -## Debugging LightGBM Crashes in CLI - -Now that you compiled LightGBM, you try it... and you always see a segmentation fault or an undocumented crash with GPU support: - -![Segmentation Fault](./_static/images/screenshot-segmentation-fault.png) - -Please check you are using the right device and whether it works with the default `gpu_device_id = 0` and `gpu_platform_id = 0`. If it still does not work with the default values, then you should follow all the steps below. 
- -You will have to redo the compilation steps for LightGBM to add debugging mode. This involves: - -* Deleting `C:/github_repos/LightGBM/build` folder -* Deleting `lightgbm.exe`, `lib_lightgbm.dll`, and `lib_lightgbm.dll.a` files - -![Files to remove](./_static/images/screenshot-files-to-remove.png) - -Once you removed the file, go into CMake, and follow the usual steps. Before clicking "Generate", click on "Add Entry": - -![Added manual entry in CMake](./_static/images/screenshot-added-manual-entry-in-cmake.png) - -In addition, click on Configure and Generate: - -![Configured and Generated CMake](./_static/images/screenshot-configured-and-generated-cmake.png) - -And then, follow the regular LightGBM CLI installation from there. - -Once you have installed LightGBM CLI, assuming your LightGBM is in `C:\github_repos\LightGBM`, open a command prompt and run the following: - -``` -gdb --args "../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu -``` - -![Debug run](./_static/images/screenshot-debug-run.png) - -Type `run` and Enter key. - -You will probably get something similar to this: - -``` -[LightGBM] [Info] This is the GPU trainer!! -[LightGBM] [Info] Total Bins 6143 -[LightGBM] [Info] Number of data: 7000, number of used features: 28 -[New Thread 105220.0x1a62c] -[LightGBM] [Info] Using GPU Device: Oland, Vendor: Advanced Micro Devices, Inc. -[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins... - -Program received signal SIGSEGV, Segmentation fault. -0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll -(gdb) -``` - -There, write `backtrace` and Enter key as many times as gdb requests two choices: - -``` -Program received signal SIGSEGV, Segmentation fault. -0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll -(gdb) backtrace -#0 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll -#1 0x000000000048bbe5 in std::char_traits::length (__s=0x0) - at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/char_traits.h:267 -#2 std::operator+, std::allocator > (__rhs="\\", __lhs=0x0) - at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/basic_string.tcc:1157 -#3 boost::compute::detail::appdata_path[abi:cxx11]() () at C:/boost/boost-build/include/boost/compute/detail/path.hpp:38 -#4 0x000000000048eec3 in boost::compute::detail::program_binary_path (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", create=create@entry=false) - at C:/boost/boost-build/include/boost/compute/detail/path.hpp:46 -#5 0x00000000004913de in boost::compute::program::load_program_binary (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", ctx=...) 
- at C:/boost/boost-build/include/boost/compute/program.hpp:605 -#6 0x0000000000490ece in boost::compute::program::build_with_source ( - source="\n#ifndef _HISTOGRAM_256_KERNEL_\n#define _HISTOGRAM_256_KERNEL_\n\n#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n#pragma OPENC -L EXTENSION cl_khr_global_int32_base_atomics : enable\n\n//"..., context=..., - options=" -D POWER_FEATURE_WORKGROUPS=5 -D USE_CONSTANT_BUF=0 -D USE_DP_FLOAT=0 -D CONST_HESSIAN=0 -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros -c -l-fast-relaxed-math") at C:/boost/boost-build/include/boost/compute/program.hpp:549 -#7 0x0000000000454339 in LightGBM::GPUTreeLearner::BuildGPUKernels () at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:583 -#8 0x00000000636044f2 in libgomp-1!GOMP_parallel () from C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin\libgomp-1.dll -#9 0x0000000000455e7e in LightGBM::GPUTreeLearner::BuildGPUKernels (this=this@entry=0x3b9cac0) - at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:569 -#10 0x0000000000457b49 in LightGBM::GPUTreeLearner::InitGPU (this=0x3b9cac0, platform_id=, device_id=) - at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:720 -#11 0x0000000000410395 in LightGBM::GBDT::ResetTrainingData (this=0x1f26c90, config=, train_data=0x1f28180, objective_function=0x1f280e0, - training_metrics=std::vector of length 2, capacity 2 = {...}) at C:\LightGBM\src\boosting\gbdt.cpp:98 -#12 0x0000000000402e93 in LightGBM::Application::InitTrain (this=this@entry=0x23f9d0) at C:\LightGBM\src\application\application.cpp:213 ----Type to continue, or q to quit--- -#13 0x00000000004f0b55 in LightGBM::Application::Run (this=0x23f9d0) at C:/LightGBM/include/LightGBM/application.h:84 -#14 main (argc=6, argv=0x1f21e90) at C:\LightGBM\src\main.cpp:7 -``` - -Right-click the command prompt, click "Mark", and select all the text from the first line (with the command prompt containing gdb) to the last line printed, containing all the log, such as: - -``` -C:\LightGBM\examples\binary_classification>gdb --args "../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu -GNU gdb (GDB) 7.10.1 -Copyright (C) 2015 Free Software Foundation, Inc. -License GPLv3+: GNU GPL version 3 or later -This is free software: you are free to change and redistribute it. -There is NO WARRANTY, to the extent permitted by law. Type "show copying" -and "show warranty" for details. -This GDB was configured as "x86_64-w64-mingw32". -Type "show configuration" for configuration details. -For bug reporting instructions, please see: -. -Find the GDB manual and other documentation resources online at: -. -For help, type "help". -Type "apropos word" to search for commands related to "word"... -Reading symbols from ../../lightgbm.exe...done. -(gdb) run -Starting program: C:\LightGBM\lightgbm.exe "config=train.conf" "data=binary.train" "valid=binary.test" "objective=binary" "device=gpu" -[New Thread 105220.0x199b8] -[New Thread 105220.0x783c] -[Thread 105220.0x783c exited with code 0] -[LightGBM] [Info] Finished loading parameters -[New Thread 105220.0x19490] -[New Thread 105220.0x1a71c] -[New Thread 105220.0x19a24] -[New Thread 105220.0x4fb0] -[Thread 105220.0x4fb0 exited with code 0] -[LightGBM] [Info] Loading weights... -[New Thread 105220.0x19988] -[Thread 105220.0x19988 exited with code 0] -[New Thread 105220.0x1a8fc] -[Thread 105220.0x1a8fc exited with code 0] -[LightGBM] [Info] Loading weights... 
-[New Thread 105220.0x1a90c] -[Thread 105220.0x1a90c exited with code 0] -[LightGBM] [Info] Finished loading data in 1.011408 seconds -[LightGBM] [Info] Number of positive: 3716, number of negative: 3284 -[LightGBM] [Info] This is the GPU trainer!! -[LightGBM] [Info] Total Bins 6143 -[LightGBM] [Info] Number of data: 7000, number of used features: 28 -[New Thread 105220.0x1a62c] -[LightGBM] [Info] Using GPU Device: Oland, Vendor: Advanced Micro Devices, Inc. -[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins... - -Program received signal SIGSEGV, Segmentation fault. -0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll -(gdb) backtrace -#0 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll -#1 0x000000000048bbe5 in std::char_traits::length (__s=0x0) - at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/char_traits.h:267 -#2 std::operator+, std::allocator > (__rhs="\\", __lhs=0x0) - at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/basic_string.tcc:1157 -#3 boost::compute::detail::appdata_path[abi:cxx11]() () at C:/boost/boost-build/include/boost/compute/detail/path.hpp:38 -#4 0x000000000048eec3 in boost::compute::detail::program_binary_path (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", create=create@entry=false) - at C:/boost/boost-build/include/boost/compute/detail/path.hpp:46 -#5 0x00000000004913de in boost::compute::program::load_program_binary (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", ctx=...) - at C:/boost/boost-build/include/boost/compute/program.hpp:605 -#6 0x0000000000490ece in boost::compute::program::build_with_source ( - source="\n#ifndef _HISTOGRAM_256_KERNEL_\n#define _HISTOGRAM_256_KERNEL_\n\n#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n#pragma OPENC -L EXTENSION cl_khr_global_int32_base_atomics : enable\n\n//"..., context=..., - options=" -D POWER_FEATURE_WORKGROUPS=5 -D USE_CONSTANT_BUF=0 -D USE_DP_FLOAT=0 -D CONST_HESSIAN=0 -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros -c -l-fast-relaxed-math") at C:/boost/boost-build/include/boost/compute/program.hpp:549 -#7 0x0000000000454339 in LightGBM::GPUTreeLearner::BuildGPUKernels () at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:583 -#8 0x00000000636044f2 in libgomp-1!GOMP_parallel () from C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin\libgomp-1.dll -#9 0x0000000000455e7e in LightGBM::GPUTreeLearner::BuildGPUKernels (this=this@entry=0x3b9cac0) - at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:569 -#10 0x0000000000457b49 in LightGBM::GPUTreeLearner::InitGPU (this=0x3b9cac0, platform_id=, device_id=) - at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:720 -#11 0x0000000000410395 in LightGBM::GBDT::ResetTrainingData (this=0x1f26c90, config=, train_data=0x1f28180, objective_function=0x1f280e0, - training_metrics=std::vector of length 2, capacity 2 = {...}) at C:\LightGBM\src\boosting\gbdt.cpp:98 -#12 0x0000000000402e93 in LightGBM::Application::InitTrain (this=this@entry=0x23f9d0) at C:\LightGBM\src\application\application.cpp:213 ----Type to continue, or q to quit--- -#13 0x00000000004f0b55 in LightGBM::Application::Run (this=0x23f9d0) at C:/LightGBM/include/LightGBM/application.h:84 -#14 main (argc=6, argv=0x1f21e90) at C:\LightGBM\src\main.cpp:7 -``` - -And open an issue in GitHub [here](https://github.com/Microsoft/LightGBM/issues) with that log. 
diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst new file mode 100644 index 000000000..dd8ee36be --- /dev/null +++ b/docs/GPU-Windows.rst @@ -0,0 +1,565 @@ +GPU Windows Compilation +======================= + +This guide is for the MinGW build. + +For the MSVC (Visual Studio) build with GPU, please refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__. +(We recommend you to use this since it is much easier). + +Install LightGBM GPU version in Windows (CLI / R / Python), using MinGW/gcc +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is for a vanilla installation of Boost, including full compilation steps from source without precompiled libraries. + +Installation steps (depends on what you are going to do): + +- Install the appropriate OpenCL SDK + +- Install MinGW + +- Install Boost + +- Install Git + +- Install CMake + +- Create LightGBM binaries + +- Debugging LightGBM in CLI (if GPU is crashing or any other crash reason) + +If you wish to use another compiler like Visual Studio C++ compiler, you need to adapt the steps to your needs. + +For this compilation tutorial, we are using AMD SDK for our OpenCL steps. +However, you are free to use any OpenCL SDK you want, you just need to adjust the PATH correctly. + +You will also need administrator rights. This will not work without them. + +At the end, you can restore your original PATH. + +-------------- + +Modifying PATH (for newbies) +---------------------------- + +To modify PATH, just follow the pictures after going to the ``Control Panel``: + +.. image:: ./_static/images/screenshot-system.png + :align: center + +Then, go to ``Advanced`` > ``Environment Variables...``: + +.. image:: ./_static/images/screenshot-advanced-system-settings.png + :align: center + +Under ``System variables``, the variable ``Path``: + +.. image:: ./_static/images/screenshot-environment-variables.png + :align: center + +-------------- + +Antivirus Performance Impact +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Does not apply to you if you do not use a third-party antivirus nor the default preinstalled antivirus on Windows. + +**Windows Defender or any other antivirus will have a significant impact on the speed you will be able to perform the steps.** +It is recommended to **turn them off temporarily** until you have finished building and setting up everything, then turn them back on, if you are using them. + +-------------- + +OpenCL SDK Installation +----------------------- + +Installing the appropriate OpenCL SDK requires you to download the correct vendor source SDK. +You need to know what you are going to run LightGBM on: + +- For running on Intel, get `Intel SDK for OpenCL`_ (NOT RECOMMENDED) + +- For running on AMD, get `AMD APP SDK`_ + +- For running on NVIDIA, get `CUDA Toolkit`_ + +Further reading and correspondence table (especially if you intend to use cross-platform devices, +like Intel CPU with AMD APP SDK): `GPU SDK Correspondence and Device Targeting Table <./GPU-Targets.rst>`__. + +**Warning**: using Intel OpenCL is not recommended and may crash your machine due to being non-compliant with OpenCL standards. +If your objective is to use LightGBM + OpenCL on CPU, please use AMD APP SDK instead (it can also run on Intel CPUs without any issues). + +-------------- + +MinGW Correct Compiler Selection +-------------------------------- + +If you are expecting to use LightGBM without R, you need to install MinGW. +Installing MinGW is straightforward, download `this`_.
+ +Make sure you are using the x86\_64 architecture, and do not modify anything else. +You may choose a version other than the most recent one if you need a previous MinGW version. + +.. image:: ./_static/images/screenshot-mingw-installation.png + :align: center + +Then, add to your PATH the following (to adjust to your MinGW version): + +:: + + C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin + +**Warning**: R users (even if you do not want LightGBM for R) + +If you have RTools and MinGW installed, and wish to use LightGBM in R, +get rid of MinGW from PATH (to keep: ``c:\Rtools\bin;c:\Rtools\mingw_32\bin`` for 32-bit R installation, +``c:\Rtools\bin;c:\Rtools\mingw_64\bin`` for 64-bit R installation). + +You can check which MinGW version you are using by running the following in a command prompt: ``gcc -v``: + +.. image:: ./_static/images/screenshot-r-mingw-used.png + :align: center + +To check whether you need 32-bit or 64-bit MinGW for R, install LightGBM as usual and check for the following: + +.. code:: r + + * installing *source* package 'lightgbm' ... + ** libs + c:/Rtools/mingw_64/bin/g++ + +If it says ``mingw_64`` then you need the 64-bit version (PATH with ``c:\Rtools\bin;c:\Rtools\mingw_64\bin``), +otherwise you need the 32-bit version (``c:\Rtools\bin;c:\Rtools\mingw_32\bin``), the latter being a very rare and untested case. + +Quick installation of LightGBM can be done using: + +.. code:: r + + devtools::install_github("Microsoft/LightGBM", subdir = "R-package") + +-------------- + +Boost Compilation +----------------- + +Installing Boost requires to download Boost and to install it. +It takes about 10 minutes to several hours depending on your CPU speed and network speed. + +We will assume an installation in ``C:\boost`` and a general installation (like in Unix variants: without versioning and without type tags). + +There is one mandatory step to check the compiler: + +- **Warning**: if you want the R installation: + If you have already MinGW in your PATH variable, get rid of it (you will link to the wrong compiler otherwise). + +- **Warning**: if you want the CLI installation: + If you have already Rtools in your PATH variable, get rid of it (you will link to the wrong compiler otherwise). + +- R installation must have Rtools in PATH + +- CLI / Python installation must have MinGW (not Rtools) in PATH + +In addition, assuming you are going to use ``C:\boost`` for the folder path, +you should add now already the following to PATH: ``C:\boost\boost-build\bin``, ``C:\boost\boost-build\include\boost``. +Adjust ``C:\boost`` if you install it elsewhere. + +We can now start downloading and compiling the required Boost libraries: + +- Download `Boost`_ (boost\_1\_63\_0.zip) + +- Extract the archive to ``C:\boost`` + +- Open a command prompt, and run + + .. code:: + + cd C:\boost\boost_1_63_0\tools\build + bootstrap.bat gcc + b2 install --prefix="C:\boost\boost-build" toolset=gcc + cd C:\boost\boost_1_63_0 + +To build the Boost libraries, you have two choices for command prompt: + +- If you have only one single core, you can use the default + + .. code:: + + b2 install --build_dir="C:\boost\boost-build" --prefix="C:\boost\boost-build" toolset=gcc --with=filesystem,system threading=multi --layout=system release + +- If you want to do a multithreaded library building (faster), add ``-j N`` by replacing N by the number of cores/threads you have. + For instance, for 2 cores, you would do + + .. 
code:: + + b2 install --build_dir="C:\boost\boost-build" --prefix="C:\boost\boost-build" toolset=gcc --with=filesystem,system threading=multi --layout=system release -j 2 + +Ignore all the errors popping up, like Python, etc., they do not matter for us. + +Your folder should look like this at the end (not fully detailed): + +:: + + - C + |--- boost + |------ boost_1_63_0 + |--------- some folders and files + |------ boost-build + |--------- bin + |--------- include + |------------ boost + |--------- lib + |--------- share + +This is what you should (approximately) get at the end of Boost compilation: + +.. image:: ./_static/images/screenshot-boost-compiled.png + :align: center + +If you are getting an error: + +- Wipe your boost directory + +- Close the command prompt + +- Make sure you added + ``C:\boost\boost-build\bin``, ``C:\boost\boost-build\include\boost`` to + your PATH (adjust accordingly if you use another folder) + +- Do the boost compilation steps again (extract => command prompt => ``cd`` => ``bootstrap`` => ``b2`` => ``cd`` => ``b2`` + +-------------- + +Git Installation +---------------- + +Installing Git for Windows is straightforward, use the following `link`_. + +.. image:: ./_static/images/screenshot-git-for-windows.png + :align: center + +Then, click on the big Download button, you can't miss it. + +Now, we can fetch LightGBM repository for GitHub. Run Git Bash and the following command: + +:: + + cd C:/ + mkdir github_repos + cd github_repos + git clone --recursive https://github.com/Microsoft/LightGBM + +Your LightGBM repository copy should now be under ``C:\github_repos\LightGBM``. +You are free to use any folder you want, but you have to adapt. + +Keep Git Bash open. + +-------------- + +CMake Installation, Configuration, Generation +--------------------------------------------- + +**CLI / Python users only** + +Installing CMake requires one download first and then a lot of configuration for LightGBM: + +.. image:: ./_static/images/screenshot-downloading-cmake.png + :align: center + +- Download `CMake`_ 3.8.0 + +- Install CMake + +- Run cmake-gui + +- Select the folder where you put LightGBM for ``Where is the source code``, + default using our steps would be ``C:/github_repos/LightGBM`` + +- Copy the folder name, and add ``/build`` for "Where to build the binaries", + default using our steps would be ``C:/github_repos/LightGBM/build`` + +- Click ``Configure`` + + .. image:: ./_static/images/screenshot-create-directory.png + :align: center + + .. image:: ./_static/images/screenshot-mingw-makefiles-to-use.png + :align: center + +- Lookup for ``USE_GPU`` and check the checkbox + + .. image:: ./_static/images/screenshot-use-gpu.png + :align: center + +- Click ``Configure`` + + You should get (approximately) the following after clicking Configure: + + .. image:: ./_static/images/screenshot-configured-lightgbm.png + :align: center + + :: + + Looking for CL_VERSION_2_0 + Looking for CL_VERSION_2_0 - found + Found OpenCL: C:/Windows/System32/OpenCL.dll (found version "2.0") + OpenCL include directory:C:/Program Files (x86)/AMD APP SDK/3.0/include + Boost version: 1.63.0 + Found the following Boost libraries: + filesystem + system + Configuring done + +- Click ``Generate`` to get the following message: + + :: + + Generating done + +This is straightforward, as CMake is providing a large help into locating the correct elements. 
+ +-------------- + +LightGBM Compilation (CLI: final step) +-------------------------------------- + +Installation in CLI +~~~~~~~~~~~~~~~~~~~ + +**CLI / Python users** + +Creating LightGBM libraries is very simple as all the important and hard steps were done before. + +You can do everything in the Git Bash console you left open: + +- If you closed Git Bash console previously, run this to get back to the build folder: + + :: + + cd C:/github_repos/LightGBM/build + +- If you did not close the Git Bash console previously, run this to get to the build folder: + + :: + + cd LightGBM/build + +- Setup MinGW as ``make`` using + + :: + + alias make='mingw32-make' + + otherwise, beware error and name clash! + +- In Git Bash, run ``make`` and see LightGBM being installing! + +.. image:: ./_static/images/screenshot-lightgbm-with-gpu-support-compiled.png + :align: center + +If everything was done correctly, you now compiled CLI LightGBM with GPU support! + +Testing in CLI +~~~~~~~~~~~~~~ + +You can now test LightGBM directly in CLI in a **command prompt** (not Git Bash): + +:: + + cd C:/github_repos/LightGBM/examples/binary_classification + "../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu + +.. image:: ./_static/images/screenshot-lightgbm-in-cli-with-gpu.png + :align: center + +Congratulations for reaching this stage! + +To learn how to target a correct CPU or GPU for training, please see: `GPU SDK Correspondence and Device Targeting Table <./GPU-Targets.rst>`__. + +-------------- + +Debugging LightGBM Crashes in CLI +--------------------------------- + +Now that you compiled LightGBM, you try it... and you always see a segmentation fault or an undocumented crash with GPU support: + +.. image:: ./_static/images/screenshot-segmentation-fault.png + :align: center + +Please check you are using the right device and whether it works with the default ``gpu_device_id = 0`` and ``gpu_platform_id = 0``. +If it still does not work with the default values, then you should follow all the steps below. + +You will have to redo the compilation steps for LightGBM to add debugging mode. This involves: + +- Deleting ``C:/github_repos/LightGBM/build`` folder + +- Deleting ``lightgbm.exe``, ``lib_lightgbm.dll``, and ``lib_lightgbm.dll.a`` files + +.. image:: ./_static/images/screenshot-files-to-remove.png + :align: center + +Once you removed the file, go into CMake, and follow the usual steps. +Before clicking "Generate", click on "Add Entry": + +.. image:: ./_static/images/screenshot-added-manual-entry-in-cmake.png + :align: center + +In addition, click on Configure and Generate: + +.. image:: ./_static/images/screenshot-configured-and-generated-cmake.png + :align: center + +And then, follow the regular LightGBM CLI installation from there. + +Once you have installed LightGBM CLI, assuming your LightGBM is in ``C:\github_repos\LightGBM``, +open a command prompt and run the following: + +:: + + gdb --args "../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu + +.. image:: ./_static/images/screenshot-debug-run.png + :align: center + +Type ``run`` and press the Enter key. + +You will probably get something similar to this: + +:: + + [LightGBM] [Info] This is the GPU trainer!! + [LightGBM] [Info] Total Bins 6143 + [LightGBM] [Info] Number of data: 7000, number of used features: 28 + [New Thread 105220.0x1a62c] + [LightGBM] [Info] Using GPU Device: Oland, Vendor: Advanced Micro Devices, Inc. 
+ [LightGBM] [Info] Compiling OpenCL Kernel with 256 bins... + + Program received signal SIGSEGV, Segmentation fault. + 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll + (gdb) + +There, write ``backtrace`` and press the Enter key as many times as gdb requests two choices: + +:: + + Program received signal SIGSEGV, Segmentation fault. + 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll + (gdb) backtrace + #0 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll + #1 0x000000000048bbe5 in std::char_traits::length (__s=0x0) + at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/char_traits.h:267 + #2 std::operator+, std::allocator > (__rhs="\\", __lhs=0x0) + at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/basic_string.tcc:1157 + #3 boost::compute::detail::appdata_path[abi:cxx11]() () at C:/boost/boost-build/include/boost/compute/detail/path.hpp:38 + #4 0x000000000048eec3 in boost::compute::detail::program_binary_path (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", create=create@entry=false) + at C:/boost/boost-build/include/boost/compute/detail/path.hpp:46 + #5 0x00000000004913de in boost::compute::program::load_program_binary (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", ctx=...) + at C:/boost/boost-build/include/boost/compute/program.hpp:605 + #6 0x0000000000490ece in boost::compute::program::build_with_source ( + source="\n#ifndef _HISTOGRAM_256_KERNEL_\n#define _HISTOGRAM_256_KERNEL_\n\n#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n#pragma OPENC + L EXTENSION cl_khr_global_int32_base_atomics : enable\n\n//"..., context=..., + options=" -D POWER_FEATURE_WORKGROUPS=5 -D USE_CONSTANT_BUF=0 -D USE_DP_FLOAT=0 -D CONST_HESSIAN=0 -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros -c + l-fast-relaxed-math") at C:/boost/boost-build/include/boost/compute/program.hpp:549 + #7 0x0000000000454339 in LightGBM::GPUTreeLearner::BuildGPUKernels () at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:583 + #8 0x00000000636044f2 in libgomp-1!GOMP_parallel () from C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin\libgomp-1.dll + #9 0x0000000000455e7e in LightGBM::GPUTreeLearner::BuildGPUKernels (this=this@entry=0x3b9cac0) + at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:569 + #10 0x0000000000457b49 in LightGBM::GPUTreeLearner::InitGPU (this=0x3b9cac0, platform_id=, device_id=) + at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:720 + #11 0x0000000000410395 in LightGBM::GBDT::ResetTrainingData (this=0x1f26c90, config=, train_data=0x1f28180, objective_function=0x1f280e0, + training_metrics=std::vector of length 2, capacity 2 = {...}) at C:\LightGBM\src\boosting\gbdt.cpp:98 + #12 0x0000000000402e93 in LightGBM::Application::InitTrain (this=this@entry=0x23f9d0) at C:\LightGBM\src\application\application.cpp:213 + ---Type to continue, or q to quit--- + #13 0x00000000004f0b55 in LightGBM::Application::Run (this=0x23f9d0) at C:/LightGBM/include/LightGBM/application.h:84 + #14 main (argc=6, argv=0x1f21e90) at C:\LightGBM\src\main.cpp:7 + +Right-click the command prompt, click "Mark", and select all the text from the first line (with the command prompt containing gdb) to the last line printed, containing all the log, such as: + +:: + + C:\LightGBM\examples\binary_classification>gdb --args "../../lightgbm.exe" config=train.conf data=binary.train valid=binary.test objective=binary device=gpu + GNU gdb (GDB) 7.10.1 + Copyright (C) 2015 Free Software 
Foundation, Inc. + License GPLv3+: GNU GPL version 3 or later + This is free software: you are free to change and redistribute it. + There is NO WARRANTY, to the extent permitted by law. Type "show copying" + and "show warranty" for details. + This GDB was configured as "x86_64-w64-mingw32". + Type "show configuration" for configuration details. + For bug reporting instructions, please see: + . + Find the GDB manual and other documentation resources online at: + . + For help, type "help". + Type "apropos word" to search for commands related to "word"... + Reading symbols from ../../lightgbm.exe...done. + (gdb) run + Starting program: C:\LightGBM\lightgbm.exe "config=train.conf" "data=binary.train" "valid=binary.test" "objective=binary" "device=gpu" + [New Thread 105220.0x199b8] + [New Thread 105220.0x783c] + [Thread 105220.0x783c exited with code 0] + [LightGBM] [Info] Finished loading parameters + [New Thread 105220.0x19490] + [New Thread 105220.0x1a71c] + [New Thread 105220.0x19a24] + [New Thread 105220.0x4fb0] + [Thread 105220.0x4fb0 exited with code 0] + [LightGBM] [Info] Loading weights... + [New Thread 105220.0x19988] + [Thread 105220.0x19988 exited with code 0] + [New Thread 105220.0x1a8fc] + [Thread 105220.0x1a8fc exited with code 0] + [LightGBM] [Info] Loading weights... + [New Thread 105220.0x1a90c] + [Thread 105220.0x1a90c exited with code 0] + [LightGBM] [Info] Finished loading data in 1.011408 seconds + [LightGBM] [Info] Number of positive: 3716, number of negative: 3284 + [LightGBM] [Info] This is the GPU trainer!! + [LightGBM] [Info] Total Bins 6143 + [LightGBM] [Info] Number of data: 7000, number of used features: 28 + [New Thread 105220.0x1a62c] + [LightGBM] [Info] Using GPU Device: Oland, Vendor: Advanced Micro Devices, Inc. + [LightGBM] [Info] Compiling OpenCL Kernel with 256 bins... + + Program received signal SIGSEGV, Segmentation fault. + 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll + (gdb) backtrace + #0 0x00007ffbb37c11f1 in strlen () from C:\Windows\system32\msvcrt.dll + #1 0x000000000048bbe5 in std::char_traits::length (__s=0x0) + at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/char_traits.h:267 + #2 std::operator+, std::allocator > (__rhs="\\", __lhs=0x0) + at C:/PROGRA~1/MINGW-~1/X86_64~1.0-P/mingw64/x86_64-w64-mingw32/include/c++/bits/basic_string.tcc:1157 + #3 boost::compute::detail::appdata_path[abi:cxx11]() () at C:/boost/boost-build/include/boost/compute/detail/path.hpp:38 + #4 0x000000000048eec3 in boost::compute::detail::program_binary_path (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", create=create@entry=false) + at C:/boost/boost-build/include/boost/compute/detail/path.hpp:46 + #5 0x00000000004913de in boost::compute::program::load_program_binary (hash="d27987d5bd61e2d28cd32b8d7a7916126354dc81", ctx=...) 
+ at C:/boost/boost-build/include/boost/compute/program.hpp:605 + #6 0x0000000000490ece in boost::compute::program::build_with_source ( + source="\n#ifndef _HISTOGRAM_256_KERNEL_\n#define _HISTOGRAM_256_KERNEL_\n\n#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n\n//"..., context=..., + options=" -D POWER_FEATURE_WORKGROUPS=5 -D USE_CONSTANT_BUF=0 -D USE_DP_FLOAT=0 -D CONST_HESSIAN=0 -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math") at C:/boost/boost-build/include/boost/compute/program.hpp:549 + #7 0x0000000000454339 in LightGBM::GPUTreeLearner::BuildGPUKernels () at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:583 + #8 0x00000000636044f2 in libgomp-1!GOMP_parallel () from C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin\libgomp-1.dll + #9 0x0000000000455e7e in LightGBM::GPUTreeLearner::BuildGPUKernels (this=this@entry=0x3b9cac0) + at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:569 + #10 0x0000000000457b49 in LightGBM::GPUTreeLearner::InitGPU (this=0x3b9cac0, platform_id=, device_id=) + at C:\LightGBM\src\treelearner\gpu_tree_learner.cpp:720 + #11 0x0000000000410395 in LightGBM::GBDT::ResetTrainingData (this=0x1f26c90, config=, train_data=0x1f28180, objective_function=0x1f280e0, + training_metrics=std::vector of length 2, capacity 2 = {...}) at C:\LightGBM\src\boosting\gbdt.cpp:98 + #12 0x0000000000402e93 in LightGBM::Application::InitTrain (this=this@entry=0x23f9d0) at C:\LightGBM\src\application\application.cpp:213 + ---Type to continue, or q to quit--- + #13 0x00000000004f0b55 in LightGBM::Application::Run (this=0x23f9d0) at C:/LightGBM/include/LightGBM/application.h:84 + #14 main (argc=6, argv=0x1f21e90) at C:\LightGBM\src\main.cpp:7 + +And open an issue in GitHub `here`_ with that log. + +.. _Intel SDK for OpenCL: https://software.intel.com/en-us/articles/opencl-drivers + +.. _AMD APP SDK: http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/ + +.. _CUDA Toolkit: https://developer.nvidia.com/cuda-downloads + +.. _this: http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe + +.. _Boost: http://www.boost.org/users/history/version_1_63_0.html + +.. _link: https://git-for-windows.github.io/ + +.. _CMake: https://cmake.org/download/ + +.. _here: https://github.com/Microsoft/LightGBM/issues diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index ab2def8dc..67ab9702d 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -31,7 +31,7 @@ The exe file will be in ``LightGBM-master/windows/x64/Release`` folder. From Command Line ***************** -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (MSbuild is not needed if **Visual Studio** is installed). +1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (**MSBuild** is not needed if **Visual Studio** is installed). 2. Run the following commands: @@ -66,10 +66,12 @@ The exe and dll files will be in ``LightGBM/`` folder. **Note**: you may need to run the ``cmake -G "MinGW Makefiles" ..`` one more time if met ``sh.exe was found in your PATH`` error. +Also you may want to reed `gcc Tips <./gcc-Tips.rst>`__. + Linux ~~~~~ -LightGBM uses ``CMake`` to build. Run the following commands: +LightGBM uses **CMake** to build. Run the following commands: .. code:: @@ -80,6 +82,8 @@ LightGBM uses ``CMake`` to build. 
Run the following commands: **Note**: glibc >= 2.14 is required. +Also you may want to read `gcc Tips <./gcc-Tips.rst>`__. + OSX ~~~ @@ -102,6 +106,8 @@ Then install LightGBM: cmake .. make -j4 +Also you may want to read `gcc Tips <./gcc-Tips.rst>`__. + Docker ~~~~~~ @@ -129,7 +135,7 @@ With GUI 4. Go to ``LightGBM-master/windows`` folder. -4. Open ``LightGBM.sln`` file with Visual Studio, choose ``Release_mpi`` configuration and click ``BUILD-> Build Solution (Ctrl+Shift+B)``. +5. Open ``LightGBM.sln`` file with Visual Studio, choose ``Release_mpi`` configuration and click ``BUILD-> Build Solution (Ctrl+Shift+B)``. If you have errors about **Platform Toolset**, go to ``PROJECT-> Properties-> Configuration Properties-> General`` and select the toolset installed on your machine. @@ -140,7 +146,7 @@ From Command Line 1. You need to install `MS MPI`_ first. Both ``msmpisdk.msi`` and ``MSMpiSetup.exe`` are needed. -2. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (MSbuild is not needed if **Visual Studio** is installed). +2. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (MSBuild is not needed if **Visual Studio** is installed). 3. Run the following commands: @@ -226,11 +232,11 @@ To build LightGBM GPU version, run the following commands: Windows ^^^^^^^ -If you use **MinGW**, the build procedure are similar to the build in Linux. Refer to `GPU Windows Compilation <./GPU-Windows.md>`__ to get more details. +If you use **MinGW**, the build procedure is similar to the build in Linux. Refer to `GPU Windows Compilation <./GPU-Windows.rst>`__ to get more details. -Following procedure is for the MSVC(Microsoft Visual C++) build. +Following procedure is for the MSVC (Microsoft Visual C++) build. -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (MSbuild is not needed if **Visual Studio** is installed). +1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `MSBuild`_ (MSBuild is not needed if **Visual Studio** is installed). 2. Install **OpenCL** for Windows. The installation depends on the brand (NVIDIA, AMD, Intel) of your GPU card. diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index 3d88b038a..1a5a89543 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -3,7 +3,7 @@ Parallel Learning Guide This is a guide for parallel learning of LightGBM. -Follow the `Quick Start`_ to know how to use LightGBM first. +Follow the `Quick Start <./Quick-Start.rst>`__ to know how to use LightGBM first. Choose Appropriate Parallel Algorithm ------------------------------------- @@ -30,14 +30,14 @@ These algorithms are suited for different scenarios, which is listed in the foll | **#feature is large** | Feature Parallel | Voting Parallel | +-------------------------+----------------------+----------------------+ -More details about these parallel algorithms can be found in `optimization in parallel learning`_. +More details about these parallel algorithms can be found in `optimization in parallel learning <./Features.rst#optimization-in-parallel-learning>`__. Build Parallel Version ---------------------- Default build version support parallel learning based on the socket. -If you need to build parallel version with MPI support, please refer to `Installation Guide`_. +If you need to build parallel version with MPI support, please refer to `Installation Guide <./Installation-Guide.rst#build-mpi-version>`__.
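As a hedged illustration only (not taken from the guide): the algorithm chosen from the table above is selected with the ``tree_learner`` parameter, and the same settings described in the config-file sections that follow can also be passed from the Python package. The machine list, port and data path here are placeholders, and one such process would run on each machine.

.. code:: python

    # Illustrative sketch of a socket-based, data-parallel setup from Python.
    # Placeholders: mlist.txt (one IP per line), port 12345, higgs.train.
    import lightgbm as lgb

    params = {
        "objective": "binary",
        "tree_learner": "data",            # or "feature" / "voting", per the table above
        "num_machines": 2,
        "machine_list_file": "mlist.txt",  # same file on every machine
        "local_listen_port": 12345,
    }
    train_set = lgb.Dataset("higgs.train")
    booster = lgb.train(params, train_set, num_boost_round=10)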
Preparation ----------- @@ -64,7 +64,7 @@ Then write these IP in one file (assume ``mlist.txt``) like following: machine1_ip machine2_ip -Note: For Windows users, need to start "smpd" to start MPI service. More details can be found `here`_. +**Note**: for Windows users, you need to start "smpd" to start the MPI service. More details can be found `here`_. Run Parallel Learning --------------------- @@ -74,49 +74,53 @@ Socket Version 1. Edit following parameters in config file: -``tree_learner=your_parallel_algorithm``, edit ``your_parallel_algorithm`` (e.g. feature/data) here. + ``tree_learner=your_parallel_algorithm``, edit ``your_parallel_algorithm`` (e.g. feature/data) here. -``num_machines=your_num_machines``, edit ``your_num_machines`` (e.g. 4) here. + ``num_machines=your_num_machines``, edit ``your_num_machines`` (e.g. 4) here. -``machine_list_file=mlist.txt``, ``mlist.txt`` is created in `Preparation section <#preparation>`__. + ``machine_list_file=mlist.txt``, ``mlist.txt`` is created in `Preparation section <#preparation>`__. -``local_listen_port=12345``, ``12345`` is allocated in `Preparation section <#preparation>`__. + ``local_listen_port=12345``, ``12345`` is allocated in `Preparation section <#preparation>`__. 2. Copy data file, executable file, config file and ``mlist.txt`` to all machines. 3. Run following command on all machines, you need to change ``your_config_file`` to real config file. -For Windows: ``lightgbm.exe config=your_config_file`` + For Windows: ``lightgbm.exe config=your_config_file`` -For Linux: ``./lightgbm config=your_config_file`` + For Linux: ``./lightgbm config=your_config_file`` MPI Version ^^^^^^^^^^^ 1. Edit following parameters in config file: -``tree_learner=your_parallel_algorithm``, edit ``your_parallel_algorithm`` (e.g. feature/data) here. + ``tree_learner=your_parallel_algorithm``, edit ``your_parallel_algorithm`` (e.g. feature/data) here. -``num_machines=your_num_machines``, edit ``your_num_machines`` (e.g. 4) here. + ``num_machines=your_num_machines``, edit ``your_num_machines`` (e.g. 4) here. -2. Copy data file, executable file, config file and ``mlist.txt`` to all machines. Note: MPI needs to be run in the **same path on all machines**. +2. Copy data file, executable file, config file and ``mlist.txt`` to all machines. + + **Note**: MPI needs to be run in the **same path on all machines**. 3. Run following command on one machine (not need to run on all machines), need to change ``your_config_file`` to real config file. -For Windows: ``mpiexec.exe /machinefile mlist.txt lightgbm.exe config=your_config_file`` + For Windows: + + .. code:: -For Linux: ``mpiexec --machinefile mlist.txt ./lightgbm config=your_config_file`` + mpiexec.exe /machinefile mlist.txt lightgbm.exe config=your_config_file + + For Linux: + + .. code:: + + mpiexec --machinefile mlist.txt ./lightgbm config=your_config_file Example ^^^^^^^ -- `A simple parallel example`_. - -.. _Quick Start: ./Quick-Start.md - -.. _optimization in parallel learning: ./Features.md - -.. _Installation Guide: ./Installation-Guide.rst +- `A simple parallel example`_ .. _here: https://blogs.technet.microsoft.com/windowshpc/2015/02/02/how-to-compile-and-run-a-simple-ms-mpi-program/ diff --git a/docs/Parameters-Tuning.rst b/docs/Parameters-Tuning.rst new file mode 100644 index 000000000..57b419a57 --- /dev/null +++ b/docs/Parameters-Tuning.rst @@ -0,0 +1,80 @@ +Parameters Tuning +================= + +This page contains parameter tuning suggestions for LightGBM.
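Before the new Parameters-Tuning.rst content continues below, a minimal sketch of the socket-version setup described above may help; it only writes the two files the guide tells you to edit, and every concrete value (IPs, port, data file name) is a placeholder rather than a recommendation:

.. code:: python

    # Minimal sketch of the "Preparation" and "Socket Version" steps above.
    # All concrete values are placeholders.
    machines = ["machine1_ip", "machine2_ip"]      # replace with real IPs

    # machine list file, one machine per line (as in the Preparation section)
    with open("mlist.txt", "w") as f:
        f.write("\n".join(machines) + "\n")

    # config file with the parameters the guide tells you to edit
    config = [
        "task=train",
        "data=your_train_data",                    # placeholder training data file
        "tree_learner=data",                       # e.g. feature/data
        "num_machines=2",
        "machine_list_file=mlist.txt",
        "local_listen_port=12345",
    ]
    with open("your_config_file.conf", "w") as f:
        f.write("\n".join(config) + "\n")

    # then run on every machine: lightgbm config=your_config_file.conf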
+ +**List of other helpful links** + +- `Parameters <./Parameters.rst>`__ +- `Python API <./Python-API.rst>`__ + +Tune Parameters for the Leaf-wise (Best-first) Tree +--------------------------------------------------- + +LightGBM uses the `leaf-wise <./Features.rst#leaf-wise-best-first-tree-growth>`__ tree growth algorithm, while many other popular tools use depth-wise tree growth. +Compared with depth-wise growth, the leaf-wise algorithm can converge much faster. +However, the leaf-wise growth may over-fit if not used with the appropriate parameters. + +To get good results using a leaf-wise tree, these are some important parameters: + +1. ``num_leaves``. This is the main parameter to control the complexity of the tree model. + Theoretically, we can set ``num_leaves = 2^(max_depth)`` to convert from a depth-wise tree. + However, this simple conversion is not good in practice. + The reason is that, when the number of leaves is the same, the leaf-wise tree is much deeper than the depth-wise tree. As a result, it may be over-fitting. + Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``. + For example, when ``max_depth=6`` the depth-wise tree can get good accuracy, + but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise. + Actually, the concept ``depth`` can be forgotten in a leaf-wise tree, since it doesn't have a correct mapping from ``leaves`` to ``depth``. + +2. ``min_data_in_leaf``. This is a very important parameter to deal with over-fitting in a leaf-wise tree. + Its value depends on the number of training data and ``num_leaves``. + Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. + In practice, setting it to hundreds or thousands is enough for a large dataset. + +3. ``max_depth``. You can also use ``max_depth`` to limit the tree depth explicitly. + +For Faster Speed +---------------- + +- Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` + +- Use feature sub-sampling by setting ``feature_fraction`` + +- Use small ``max_bin`` + +- Use ``save_binary`` to speed up data loading in future learning + +- Use parallel learning, refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ + + +For Better Accuracy +------------------- + +- Use large ``max_bin`` (may be slower) + +- Use small ``learning_rate`` with large ``num_iterations`` + +- Use large ``num_leaves`` (may cause over-fitting) + +- Use bigger training data + +- Try ``dart`` + +Deal with Over-fitting +---------------------- + +- Use small ``max_bin`` + +- Use small ``num_leaves`` + +- Use ``min_data_in_leaf`` and ``min_sum_hessian_in_leaf`` + +- Use bagging by setting ``bagging_fraction`` and ``bagging_freq`` + +- Use feature sub-sampling by setting ``feature_fraction`` + +- Use bigger training data + +- Try ``lambda_l1``, ``lambda_l2`` and ``min_gain_to_split`` for regularization + +- Try ``max_depth`` to avoid growing a deep tree diff --git a/docs/Parameters-tuning.md b/docs/Parameters-tuning.md deleted file mode 100644 index ab08f26bd..000000000 --- a/docs/Parameters-tuning.md +++ /dev/null @@ -1,47 +0,0 @@ -# Parameters Tuning - -This is a page contains all parameters in LightGBM.
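The leaf-wise tuning advice added above can be summarized in a small, purely illustrative parameter sketch; the numbers either come from the examples in the text (70 to 80 leaves when a depth of 6 works well depth-wise, hundreds for ``min_data_in_leaf`` on large data) or are otherwise arbitrary:

.. code:: python

    # Illustrative only: values follow the tuning advice above and are not
    # recommendations for any particular dataset.
    params = {
        "objective": "binary",        # hypothetical task
        "num_leaves": 80,             # keep it well below 2^(max_depth)
        "max_depth": 7,               # optional explicit depth cap (2^7 = 128 > 80)
        "min_data_in_leaf": 500,      # "hundreds or thousands" for a large dataset
        "feature_fraction": 0.8,      # feature sub-sampling
        "bagging_fraction": 0.8,      # bagging also needs bagging_freq > 0
        "bagging_freq": 5,
        "learning_rate": 0.05,        # small learning_rate with large num_iterations
        "num_iterations": 500,
    }
    print(params)  # pass this dict to lgb.train() or write it out as key=value pairs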
- -***List of other Helpful Links*** -* [Parameters](./Parameters.md) -* [Python API](./Python-API.rst) - -## Tune Parameters for the Leaf-wise (Best-first) Tree - -LightGBM uses the [leaf-wise](./Features.md) tree growth algorithm, while many other popular tools use depth-wise tree growth. Compared with depth-wise growth, the leaf-wise algorithm can convenge much faster. However, the leaf-wise growth may be over-fitting if not used with the appropriate parameters. - -To get good results using a leaf-wise tree, these are some important parameters: - -1. ```num_leaves```. This is the main parameter to control the complexity of the tree model. Theoretically, we can set ```num_leaves = 2^(max_depth) ``` to convert from depth-wise tree. However, this simple conversion is not good in practice. The reason is, when number of leaves are the same, the leaf-wise tree is much deeper than depth-wise tree. As a result, it may be over-fitting. Thus, when trying to tune the ```num_leaves```, we should let it be smaller than ```2^(max_depth)```. For example, when the ```max_depth=6``` the depth-wise tree can get good accuracy, but setting ```num_leaves``` to ```127``` may cause over-fitting, and setting it to ```70``` or ```80``` may get better accuracy than depth-wise. Actually, the concept ```depth``` can be forgotten in leaf-wise tree, since it doesn't have a correct mapping from ```leaves``` to ```depth```. - -2. ```min_data_in_leaf```. This is a very important parameter to deal with over-fitting in leaf-wise tree. Its value depends on the number of training data and ```num_leaves```. Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset. - -3. ```max_depth```. You also can use ```max_depth``` to limit the tree depth explicitly. - - -## For Faster Speed - -* Use bagging by setting ```bagging_fraction``` and ```bagging_freq``` -* Use feature sub-sampling by setting ```feature_fraction``` -* Use small ```max_bin``` -* Use ```save_binary``` to speed up data loading in future learning -* Use parallel learning, refer to [Parallel Learning Guide](./Parallel-Learning-Guide.rst). - -## For Better Accuracy - -* Use large ```max_bin``` (may be slower) -* Use small ```learning_rate``` with large ```num_iterations``` -* Use large ```num_leaves```(may cause over-fitting) -* Use bigger training data -* Try ```dart``` - -## Deal with Over-fitting - -* Use small ```max_bin``` -* Use small ```num_leaves``` -* Use ```min_data_in_leaf``` and ```min_sum_hessian_in_leaf``` -* Use bagging by set ```bagging_fraction``` and ```bagging_freq``` -* Use feature sub-sampling by set ```feature_fraction``` -* Use bigger training data -* Try ```lambda_l1```, ```lambda_l2``` and ```min_gain_to_split``` to regularization -* Try ```max_depth``` to avoid growing deep tree diff --git a/docs/Parameters.md b/docs/Parameters.md deleted file mode 100644 index 00f3ccb81..000000000 --- a/docs/Parameters.md +++ /dev/null @@ -1,374 +0,0 @@ -# Parameters - -This is a page contains all parameters in LightGBM. 
- -***List of other Helpful Links*** -* [Python API](./Python-API.rst) -* [Parameters Tuning](./Parameters-tuning.md) - -***External Links*** -* [Laurae++ Interactive Documentation](https://sites.google.com/view/lauraepp/parameters) - -***Update of 08/04/2017*** - -Default values for the following parameters have changed: - -* min_data_in_leaf = 100 => 20 -* min_sum_hessian_in_leaf = 10 => 1e-3 -* num_leaves = 127 => 31 -* num_iterations = 10 => 100 - -## Parameter Format - -The parameter format is `key1=value1 key2=value2 ... ` . And parameters can be set both in config file and command line. By using command line, parameters should not have spaces before and after `=`. By using config files, one line can only contain one parameter. you can use `#` to comment. If one parameter appears in both command line and config file, LightGBM will use the parameter in command line. - -## Core Parameters - -* `config`, default=`""`, type=string, alias=`config_file` - * path of config file -* `task`, default=`train`, type=enum, options=`train`,`prediction` - * `train` for training - * `prediction` for prediction. - * `convert_model` for converting model file into if-else format, see more information in [Convert model parameters](#convert-model-parameters) -* `application`, default=`regression`, type=enum, options=`regression`,`regression_l1`,`huber`,`fair`,`poisson`,`binary`,`lambdarank`,`multiclass`, alias=`objective`,`app` - * `regression`, regression application - * `regression_l2`, L2 loss, alias=`mean_squared_error`,`mse` - * `regression_l1`, L1 loss, alias=`mean_absolute_error`,`mae` - * `huber`, [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia") - * `fair`, [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520) - * `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression") - * `binary`, binary classification application - * `lambdarank`, [lambdarank](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf) application - * The label should be `int` type in lambdarank tasks, and larger number represent the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). - * `label_gain` can be used to set the gain(weight) of `int` label. 
- * `multiclass`, multi-class classification application, should set `num_class` as well -* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type` - * `gbdt`, traditional Gradient Boosting Decision Tree - * `rf`, Random Forest - * `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866) - * `goss`, Gradient-based One-Side Sampling -* `data`, default=`""`, type=string, alias=`train`,`train_data` - * training data, LightGBM will train from this data -* `valid`, default=`""`, type=multi-string, alias=`test`,`valid_data`,`test_data` - * validation/test data, LightGBM will output metrics for these data - * support multi validation data, separate by `,` -* `num_iterations`, default=`100`, type=int, alias=`num_iteration`,`num_tree`,`num_trees`,`num_round`,`num_rounds` - * number of boosting iterations - * note: For python/R package, **this parameter is ignored**, use `num_boost_round` (Python) or `nrounds` (R) input arguments of `train` and `cv` methods instead - * note: internally, LightGBM constructs `num_class * num_iterations` trees for `multiclass` problems -* `learning_rate`, default=`0.1`, type=double, alias=`shrinkage_rate` - * shrinkage rate - * in `dart`, it also affects normalization weights of dropped trees -* `num_leaves`, default=`31`, type=int, alias=`num_leaf` - * number of leaves in one tree -* `tree_learner`, default=`serial`, type=enum, options=`serial`,`feature`,`data` - * `serial`, single machine tree learner - * `feature`, feature parallel tree learner - * `data`, data parallel tree learner - * Refer to [Parallel Learning Guide](./Parallel-Learning-Guide.rst) to get more details. -* `num_threads`, default=OpenMP_default, type=int, alias=`num_thread`,`nthread` - * Number of threads for LightGBM. - * For the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPU using [hyper-threading](https://en.wikipedia.org/wiki/Hyper-threading) to generate 2 threads per CPU core). - * Do not set it too large if your dataset is small (do not use 64 threads for a dataset with 10,000 for instance). - * Be aware a task manager or any similar CPU monitoring tool might report cores not being fully utilized. This is normal. - * For parallel learning, should not use full CPU cores since this will cause poor performance for the network. -* `device`, default=`cpu`, options=`cpu`,`gpu` - * Choose device for the tree learning, can use gpu to achieve the faster learning. - * Note: 1. Recommend use the smaller `max_bin`(e.g `63`) to get the better speed up. 2. For the faster speed, GPU use 32-bit float point to sum up by default, may affect the accuracy for some tasks. You can set `gpu_use_dp=true` to enable 64-bit float point, but it will slow down the training. 3. Refer to [Installation Guide](./Installation-Guide.rst) to build with GPU . - - -## Learning Control Parameters - -* `max_depth`, default=`-1`, type=int - * Limit the max depth for tree model. This is used to deal with overfit when #data is small. Tree still grow by leaf-wise. - * `< 0` means no limit -* `min_data_in_leaf`, default=`20`, type=int, alias=`min_data_per_leaf` , `min_data` - * Minimal number of data in one leaf. Can use this to deal with over-fit. -* `min_sum_hessian_in_leaf`, default=`1e-3`, type=double, alias=`min_sum_hessian_per_leaf`, `min_sum_hessian`, `min_hessian` - * Minimal sum hessian in one leaf. Like `min_data_in_leaf`, can use this to deal with over-fit. 
-* `feature_fraction`, default=`1.0`, type=double, `0.0 < feature_fraction < 1.0`, alias=`sub_feature` - * LightGBM will random select part of features on each iteration if `feature_fraction` smaller than `1.0`. For example, if set to `0.8`, will select 80% features before training each tree. - * Can use this to speed up training - * Can use this to deal with over-fit -* `feature_fraction_seed`, default=`2`, type=int - * Random seed for feature fraction. -* `bagging_fraction`, default=`1.0`, type=double, , `0.0 < bagging_fraction < 1.0`, alias=`sub_row` - * Like `feature_fraction`, but this will random select part of data without resampling - * Can use this to speed up training - * Can use this to deal with over-fit - * Note: To enable bagging, should set `bagging_freq` to a non zero value as well -* `bagging_freq`, default=`0`, type=int - * Frequency for bagging, `0` means disable bagging. `k` means will perform bagging at every `k` iteration. - * Note: To enable bagging, should set `bagging_fraction` as well -* `bagging_seed` , default=`3`, type=int - * Random seed for bagging. -* `early_stopping_round` , default=`0`, type=int, alias=`early_stopping_rounds`,`early_stopping` - * Will stop training if one metric of one validation data doesn't improve in last `early_stopping_round` rounds. -* `lambda_l1` , default=`0`, type=double - * l1 regularization -* `lambda_l2` , default=`0`, type=double - * l2 regularization -* `min_gain_to_split` , default=`0`, type=double - * The minimal gain to perform split -* `drop_rate`, default=`0.1`, type=double - * only used in `dart` -* `skip_drop`, default=`0.5`, type=double - * only used in `dart`, probability of skipping drop -* `max_drop`, default=`50`, type=int - * only used in `dart`, max number of dropped trees on one iteration. `<=0` means no limit. -* `uniform_drop`, default=`false`, type=bool - * only used in `dart`, true if want to use uniform drop -* `xgboost_dart_mode`, default=`false`, type=bool - * only used in `dart`, true if want to use xgboost dart mode -* `drop_seed`, default=`4`, type=int - * only used in `dart`, used to random seed to choose dropping models. -* `top_rate`, default=`0.2`, type=double - * only used in `goss`, the retain ratio of large gradient data -* `other_rate`, default=`0.1`, type=int - * only used in `goss`, the retain ratio of small gradient data -* `max_cat_group`, default=`64`, type=int - * use for the categorical features. - * When #catogory is large, finding the split point on it is easily over-fitting. So LightGBM merges them into `max_cat_group` groups, and finds the split points on the group boundaries. -* `min_data_per_group`, default=`10`, type=int - * Min number of data per categorical group. -* `max_cat_threshold`, default=`256`, type=int - * use for the categorical features. Limit the max threshold points in categorical features. -* `min_cat_smooth`, default=`5`, type=double - * use for the categorical features. Refer to the descrption in paramater `cat_smooth_ratio`. -* `max_cat_smooth`, default=`100`, type=double - * use for the categorical features. Refer to the descrption in paramater `cat_smooth_ratio`. -* `cat_smooth_ratio`, default=`0.01`, type=double - * use for the categorical features. This can reduce the effect of noises in categorical features, especially for categories with few data. - * The smooth denominator is `a = min(max_cat_smooth, max(min_cat_smooth, num_data/num_category*cat_smooth_ratio))`. - * The smooth numerator is `b = a * sum_gradient / sum_hessian`. 
- - -## IO Parameters - -* `max_bin`, default=`255`, type=int - * max number of bin that feature values will bucket in. Small bin may reduce training accuracy but may increase general power (deal with over-fit). - * LightGBM will auto compress memory according `max_bin`. For example, LightGBM will use `uint8_t` for feature value if `max_bin=255`. -* `min_data_in_bin`, default=`5`, type=int - * min number of data inside one bin, use this to avoid one-data-one-bin (may over-fitting). -* `data_random_seed`, default=`1`, type=int - * random seed for data partition in parallel learning(not include feature parallel). -* `output_model`, default=`LightGBM_model.txt`, type=string, alias=`model_output`,`model_out` - * file name of output model in training. -* `input_model`, default=`""`, type=string, alias=`model_input`,`model_in` - * file name of input model. - * for prediction task, will prediction data using this model. - * for train task, will continued train from this model. -* `output_result`, default=`LightGBM_predict_result.txt`, type=string, alias=`predict_result`,`prediction_result` - * file name of prediction result in prediction task. -* `is_pre_partition`, default=`false`, type=bool - * used for parallel learning(not include feature parallel). - * `true` if training data are pre-partitioned, and different machines using different partition. -* `is_sparse`, default=`true`, type=bool, alias=`is_enable_sparse` - * used to enable/disable sparse optimization. Set to `false` to disable sparse optimization. -* `two_round`, default=`false`, type=bool, alias=`two_round_loading`,`use_two_round_loading` - * by default, LightGBM will map data file to memory and load features from memory. This will provide faster data loading speed. But it may out of memory when the data file is very big. - * set this to `true` if data file is too big to fit in memory. -* `save_binary`, default=`false`, type=bool, alias=`is_save_binary`,`is_save_binary_file` - * set this to `true` will save the data set(include validation data) to a binary file. Speed up the data loading speed for the next time. -* `verbosity`, default=`1`, type=int, alias=`verbose` - * `<0` = Fatel, `=0` = Error(Warn), `>0` = Info -* `header`, default=`false`, type=bool, alias=`has_header` - * `true` if input data has header -* `label`, default=`""`, type=string, alias=`label_column` - * specific the label column - * Use number for index, e.g. `label=0` means column_0 is the label - * Add a prefix `name:` for column name, e.g. `label=name:is_click` -* `weight`, default=`""`, type=string, alias=`weight_column` - * specific the weight column - * Use number for index, e.g. `weight=0` means column_0 is the weight - * Add a prefix `name:` for column name, e.g. `weight=name:weight` - * Note: Index start from `0`. And it doesn't count the label column when passing type is Index. e.g. when label is column_0, and weight is column_1, the correct parameter is `weight=0`. -* `query`, default=`""`, type=string, alias=`query_column`,`group`,`group_column` - * specific the query/group id column - * Use number for index, e.g. `query=0` means column_0 is the query id - * Add a prefix `name:` for column name, e.g. `query=name:query_id` - * Note: Data should group by query_id. Index start from `0`. And it doesn't count the label column when passing type is Index. e.g. when label is column_0, and query_id is column_1, the correct parameter is `query=0`. 
-* `ignore_column`, default=`""`, type=string, alias=`ignore_feature`,`blacklist` - * specific some ignore columns in training - * Use number for index, e.g. `ignore_column=0,1,2` means column_0, column_1 and column_2 will be ignored. - * Add a prefix `name:` for column name, e.g. `ignore_column=name:c1,c2,c3` means c1, c2 and c3 will be ignored. - * Note: Index start from `0`. And it doesn't count the label column. -* `categorical_feature`, default=`""`, type=string, alias=`categorical_column`,`cat_feature`,`cat_column` - * specific categorical features - * Use number for index, e.g. `categorical_feature=0,1,2` means column_0, column_1 and column_2 are categorical features. - * Add a prefix `name:` for column name, e.g. `categorical_feature=name:c1,c2,c3` means c1, c2 and c3 are categorical features. - * Note: Only support categorical with `int` type (Note: the negative values will be treated as Missing values). Index start from `0`. And it doesn't count the label column. -* `predict_raw_score`, default=`false`, type=bool, alias=`raw_score`,`is_predict_raw_score` - * only used in prediction task - * Set to `true` will only predict the raw scores. - * Set to `false` will transformed score -* `predict_leaf_index`, default=`false`, type=bool, alias=`leaf_index`,`is_predict_leaf_index` - * only used in prediction task - * Set to `true` to predict with leaf index of all trees -* `predict_contrib`, default=`false`, type=bool, alias=`contrib`,`is_predict_contrib` - * only used in prediction task - * Set to `true` to estimate [SHAP values](https://arxiv.org/abs/1706.06060), which represent how each feature contributed to each prediction. Produces number of features + 1 values where the last value is the expected value of the model output over the training data. -* `bin_construct_sample_cnt`, default=`200000`, type=int - * Number of data that sampled to construct histogram bins. - * Will give better training result when set this larger. But will increase data loading time. - * Set this to larger value if data is very sparse. -* `num_iteration_predict`, default=`-1`, type=int - * only used in prediction task, used to how many trained iterations will be used in prediction. - * `<= 0` means no limit -* `pred_early_stop`, default=`false`, type=bool - * Set to `true` will use early-stopping to speed up the prediction. May affect the accuracy. -* `pred_early_stop_freq`, default=`10`, type=int - * The frequency of checking early-stopping prediction. -* `pred_early_stop_margin`, default=`10.0`, type=double - * The Threshold of margin in early-stopping prediction. -* `use_missing`, default=`true`, type=bool - * Set to `false` will disable the special handle of missing value. -* `zero_as_missing`, default=`false`, type=bool - * Set to `true` will treat all zero as missing values (including the unshown values in libsvm/sparse matrics). - * Set to `false` will use `na` to represent missing values. -* `init_score_file`, default=`""`, type=string - * Path of training initial score file, `""` will use `train_data_file+".init"` (if exists). -* `valid_init_score_file`, default=`""`, type=multi-string - * Path of validation initial score file, `""` will use `valid_data_file+".init"` (if exists). - * separate by `,` for multi-validation data - - -## Objective Parameters - -* `sigmoid`, default=`1.0`, type=double - * parameter for sigmoid function. Will be used in binary classification and lambdarank. 
-* `huber_delta`, default=`1.0`, type=double - * parameter for [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia"). Will be used in regression task. -* `fair_c`, default=`1.0`, type=double - * parameter for [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520). Will be used in regression task. -* `gaussian_eta`, default=`1.0`, type=double - * parameter to control the width of Gaussian function. Will be used in l1 and huber regression loss. -* `poission_max_delta_step`, default=`0.7`, type=double - * parameter used to safeguard optimization -* `scale_pos_weight`, default=`1.0`, type=double - * weight of positive class in binary classification task -* `boost_from_average`, default=`true`, type=bool - * adjust initial score to the mean of labels for faster convergence, only used in Regression task. -* `is_unbalance`, default=`false`, type=bool - * used in binary classification. Set this to `true` if training data are unbalance. -* `max_position`, default=`20`, type=int - * used in lambdarank, will optimize NDCG at this position. -* `label_gain`, default=`0,1,3,7,15,31,63,...`, type=multi-double - * used in lambdarank, relevant gain for labels. For example, the gain of label `2` is `3` if using default label gains. - * Separate by `,` -* `num_class`, default=`1`, type=int, alias=`num_classes` - * only used in multi-class classification - - -## Metric Parameters - -* `metric`, default={`l2` for regression}, {`binary_logloss` for binary classification},{`ndcg` for lambdarank}, type=multi-enum, options=`l1`,`l2`,`ndcg`,`auc`,`binary_logloss`,`binary_error`... - * `l1`, absolute loss, alias=`mean_absolute_error`, `mae` - * `l2`, square loss, alias=`mean_squared_error`, `mse` - * `l2_root`, root square loss, alias=`root_mean_squared_error`, `rmse` - * `huber`, [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia") - * `fair`, [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520) - * `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression") - * `ndcg`, [NDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG) - * `map`, [MAP](https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision) - * `auc`, [AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) - * `binary_logloss`, [log loss](https://www.kaggle.com/wiki/LogLoss) - * `binary_error`. For one sample `0` for correct classification, `1` for error classification. - * `multi_logloss`, log loss for mulit-class classification - * `multi_error`. error rate for mulit-class classification - * Support multi metrics, separate by `,` -* `metric_freq`, default=`1`, type=int - * frequency for metric output -* `is_training_metric`, default=`false`, type=bool - * set this to true if need to output metric result of training -* `ndcg_at`, default=`1,2,3,4,5`, type=multi-int, alias=`ndcg_eval_at`,`eval_at` - * NDCG evaluation position, separate by `,` - - -## Network Parameters - -Following parameters are used for parallel learning, and only used for base(socket) version. - -* `num_machines`, default=`1`, type=int, alias=`num_machine` - * Used for parallel learning, the number of machines for parallel learning application - * Need to set this in both socket and mpi version. -* `local_listen_port`, default=`12400`, type=int, alias=`local_port` - * TCP listen port for local machines. - * Should allow this port in firewall setting before training. 
-* `time_out`, default=`120`, type=int - * Socket time-out in minutes. -* `machine_list_file`, default=`""`, type=string - * File that list machines for this parallel learning application - * Each line contains one IP and one port for one machine. The format is `ip port`, separate by space. - - -## GPU Parameters - -* `gpu_platform_id`, default=`-1`, type=int - * OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform. - * Default value is -1, using the system-wide default platform. -* `gpu_device_id`, default=`-1`, type=int - * OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID. - * Default value is -1, using the default device in the selected platform. -* `gpu_use_dp`, default=`false`, type=bool - * Set to true to use double precision math on GPU (default using single precision). - - -## Convert Model Parameters - -This feature is only supported in command line version yet. - -* `convert_model_language`, default=`""`, type=string - * only `cpp` is supported yet. - * if `convert_model_language` is set when `task` is set to `train`, the model will also be converted. - -* `convert_model`, default=`"gbdt_prediction.cpp"`, type=string - * output file name of converted model. - -## Others - -### Continued Training with Input Score - -LightGBM support continued train with initial score. It uses an additional file to store these initial score, like the following: - -``` -0.5 --0.1 -0.9 -... -``` - -It means the initial score of first data is `0.5`, second is `-0.1`, and so on. The initial score file corresponds with data file line by line, and has per score per line. And if the name of data file is "train.txt", the initial score file should be named as "train.txt.init" and in the same folder as the data file. And LightGBM will auto load initial score file if it exists. - - -### Weight Data - -LightGBM support weighted training. It uses an additional file to store weight data, like the following: - -``` -1.0 -0.5 -0.8 -... -``` - -It means the weight of first data is `1.0`, second is `0.5`, and so on. The weight file corresponds with data file line by line, and has per weight per line. And if the name of data file is "train.txt", the weight file should be named as "train.txt.weight" and in the same folder as the data file. And LightGBM will auto load weight file if it exists. - -update: -You can specific weight column in data file now. Please refer to parameter `weight` in above. - - -### Query Data - -For LambdaRank learning, it needs query information for training data. LightGBM use an additional file to store query data. Following is an example: - -``` -27 -18 -67 -... -``` - -It means first `27` lines samples belong one query and next `18` lines belong to another, and so on.(**Note: data should order by query**) If name of data file is "train.txt", the query file should be named as "train.txt.query" and in same folder of training data. LightGBM will load the query file automatically if it exists. - -You can specific query/group id in data file now. Please refer to parameter `group` in above. diff --git a/docs/Parameters.rst b/docs/Parameters.rst new file mode 100644 index 000000000..296a6a42e --- /dev/null +++ b/docs/Parameters.rst @@ -0,0 +1,730 @@ +Parameters +========== + +This page contains all parameters in LightGBM. 
+ +**List of other helpful links** + +- `Python API <./Python-API.rst>`__ + +- `Parameters Tuning <./Parameters-Tuning.rst>`__ + +**External Links** + +- `Laurae++ Interactive Documentation`_ + +**Update of 08/04/2017** + +Default values for the following parameters have changed: + +- ``min_data_in_leaf`` = 100 => 20 +- ``min_sum_hessian_in_leaf`` = 10 => 1e-3 +- ``num_leaves`` = 127 => 31 +- ``num_iterations`` = 10 => 100 + +Parameters Format +----------------- + +The parameters format is ``key1=value1 key2=value2 ...``. +And parameters can be set both in config file and command line. +By using command line, parameters should not have spaces before and after ``=``. +By using config files, one line can only contain one parameter. You can use ``#`` to comment. + +If one parameter appears in both command line and config file, LightGBM will use the parameter in command line. + +Core Parameters +--------------- + +- ``config``, default=\ ``""``, type=string, alias=\ ``config_file`` + + - path of config file + +- ``task``, default=\ ``train``, type=enum, options=\ ``train``, ``prediction`` + + - ``train`` for training + + - ``prediction`` for prediction. + + - ``convert_model`` for converting model file into if-else format, see more information in `Convert model parameters <#convert-model-parameters>`__ + +- ``application``, default=\ ``regression``, type=enum, + options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``binary``, ``lambdarank``, ``multiclass``, + alias=\ ``objective``, ``app`` + + - ``regression``, regression application + + - ``regression_l2``, L2 loss, alias=\ ``mean_squared_error``, ``mse`` + + - ``regression_l1``, L1 loss, alias=\ ``mean_absolute_error``, ``mae`` + + - ``huber``, `Huber loss`_ + + - ``fair``, `Fair loss`_ + + - ``poisson``, `Poisson regression`_ + + - ``binary``, binary classification application + + - ``lambdarank``, `lambdarank`_ application + + - the label should be ``int`` type in lambdarank tasks, and larger number represent the higher relevance (e.g. 
0:bad, 1:fair, 2:good, 3:perfect) + + - ``label_gain`` can be used to set the gain (weight) of ``int`` label + + - ``multiclass``, multi-class classification application, ``num_class`` should be set as well + +- ``boosting``, default=\ ``gbdt``, type=enum, + options=\ ``gbdt``, ``rf``, ``dart``, ``goss``, + alias=\ ``boost``, ``boosting_type`` + + - ``gbdt``, traditional Gradient Boosting Decision Tree + + - ``rf``, Random Forest + + - ``dart``, `Dropouts meet Multiple Additive Regression Trees`_ + + - ``goss``, Gradient-based One-Side Sampling + +- ``data``, default=\ ``""``, type=string, alias=\ ``train``, ``train_data`` + + - training data, LightGBM will train from this data + +- ``valid``, default=\ ``""``, type=multi-string, alias=\ ``test``, ``valid_data``, ``test_data`` + + - validation/test data, LightGBM will output metrics for these data + + - support multi validation data, separate by ``,`` + +- ``num_iterations``, default=\ ``100``, type=int, + alias=\ ``num_iteration``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds`` + + - number of boosting iterations + - **Note**: for Python/R package, **this parameter is ignored**, + use ``num_boost_round`` (Python) or ``nrounds`` (R) input arguments of ``train`` and ``cv`` methods instead + + - **Note**: internally, LightGBM constructs ``num_class * num_iterations`` trees for ``multiclass`` problems + +- ``learning_rate``, default=\ ``0.1``, type=double, alias=\ ``shrinkage_rate`` + + - shrinkage rate + + - in ``dart``, it also affects the normalization weights of dropped trees + +- ``num_leaves``, default=\ ``31``, type=int, alias=\ ``num_leaf`` + + - number of leaves in one tree + +- ``tree_learner``, default=\ ``serial``, type=enum, options=\ ``serial``, ``feature``, ``data`` + + - ``serial``, single machine tree learner + + - ``feature``, feature parallel tree learner + + - ``data``, data parallel tree learner + + - refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details + +- ``num_threads``, default=\ ``OpenMP_default``, type=int, alias=\ ``num_thread``, ``nthread`` + + - number of threads for LightGBM + + - for the best speed, set this to the number of **real CPU cores**, + not the number of threads (most CPUs use `hyper-threading`_ to generate 2 threads per CPU core) + + - do not set it too large if your dataset is small (do not use 64 threads for a dataset with 10,000 rows for instance) + + - be aware a task manager or any similar CPU monitoring tool might report cores not being fully utilized. **This is normal** + + - for parallel learning, should not use full CPU cores since this will cause poor performance for the network + +- ``device``, default=\ ``cpu``, options=\ ``cpu``, ``gpu`` + + - choose device for the tree learning, you can use GPU to achieve faster learning + + - **Note**: it is recommended to use a smaller ``max_bin`` (e.g. 63) to get a better speed-up + + - **Note**: for faster speed, GPU uses 32-bit floating point to sum up by default, which may affect the accuracy for some tasks. + You can set ``gpu_use_dp=true`` to enable 64-bit floating point, but it will slow down the training + + - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build with GPU + +Learning Control Parameters +--------------------------- + +- ``max_depth``, default=\ ``-1``, type=int + + - limit the max depth for the tree model. This is used to deal with over-fitting when ``#data`` is small.
Tree still grows leaf-wise + + - ``< 0`` means no limit + +- ``min_data_in_leaf``, default=\ ``20``, type=int, alias=\ ``min_data_per_leaf`` , ``min_data`` + + - minimal number of data in one leaf. Can be used to deal with over-fitting + +- ``min_sum_hessian_in_leaf``, default=\ ``1e-3``, type=double, + alias=\ ``min_sum_hessian_per_leaf``, ``min_sum_hessian``, ``min_hessian`` + + - minimal sum hessian in one leaf. Like ``min_data_in_leaf``, it can be used to deal with over-fitting + +- ``feature_fraction``, default=\ ``1.0``, type=double, ``0.0 < feature_fraction < 1.0``, alias=\ ``sub_feature`` + + - LightGBM will randomly select part of features on each iteration if ``feature_fraction`` is smaller than ``1.0``. + For example, if set to ``0.8``, will select 80% features before training each tree + + - can be used to speed up training + + - can be used to deal with over-fitting + +- ``feature_fraction_seed``, default=\ ``2``, type=int + + - random seed for ``feature_fraction`` + +- ``bagging_fraction``, default=\ ``1.0``, type=double, ``0.0 < bagging_fraction < 1.0``, alias=\ ``sub_row`` + + - like ``feature_fraction``, but this will randomly select part of data without resampling + + - can be used to speed up training + + - can be used to deal with over-fitting + + - **Note**: to enable bagging, ``bagging_freq`` should be set to a non-zero value as well + +- ``bagging_freq``, default=\ ``0``, type=int + + - frequency for bagging, ``0`` means disable bagging. ``k`` means will perform bagging at every ``k`` iteration + + - **Note**: to enable bagging, ``bagging_fraction`` should be set as well + +- ``bagging_seed`` , default=\ ``3``, type=int + + - random seed for bagging + +- ``early_stopping_round``, default=\ ``0``, type=int, alias=\ ``early_stopping_rounds``, ``early_stopping`` + + - will stop training if one metric of one validation data doesn't improve in last ``early_stopping_round`` rounds + +- ``lambda_l1``, default=\ ``0``, type=double + + - L1 regularization + +- ``lambda_l2``, default=\ ``0``, type=double + + - L2 regularization + +- ``min_gain_to_split``, default=\ ``0``, type=double + + - the minimal gain to perform split + +- ``drop_rate``, default=\ ``0.1``, type=double + + - only used in ``dart`` + +- ``skip_drop``, default=\ ``0.5``, type=double + + - only used in ``dart``, probability of skipping drop + +- ``max_drop``, default=\ ``50``, type=int + + - only used in ``dart``, max number of dropped trees on one iteration + + - ``<=0`` means no limit + +- ``uniform_drop``, default=\ ``false``, type=bool + + - only used in ``dart``, set this to ``true`` if you want to use uniform drop + +- ``xgboost_dart_mode``, default=\ ``false``, type=bool + + - only used in ``dart``, set this to ``true`` if you want to use xgboost dart mode + +- ``drop_seed``, default=\ ``4``, type=int + + - only used in ``dart``, random seed to choose dropping models + +- ``top_rate``, default=\ ``0.2``, type=double + + - only used in ``goss``, the retain ratio of large gradient data + +- ``other_rate``, default=\ ``0.1``, type=double + + - only used in ``goss``, the retain ratio of small gradient data + +- ``max_cat_group``, default=\ ``64``, type=int + + - used for the categorical features + + - when ``#category`` is large, finding the split point on it easily leads to over-fitting.
+ So LightGBM merges them into ``max_cat_group`` groups, and finds the split points on the group boundaries + +- ``min_data_per_group``, default=\ ``10``, type=int + + - min number of data per categorical group + +- ``max_cat_threshold``, default=\ ``256``, type=int + + - used for the categorical features + + - limit the max threshold points in categorical features + +- ``min_cat_smooth``, default=\ ``5``, type=double + + - used for the categorical features + + - refer to the description of the parameter ``cat_smooth_ratio`` + +- ``max_cat_smooth``, default=\ ``100``, type=double + + - used for the categorical features + + - refer to the description of the parameter ``cat_smooth_ratio`` + +- ``cat_smooth_ratio``, default=\ ``0.01``, type=double + + - used for the categorical features + + - this can reduce the effect of noises in categorical features, especially for categories with few data + + - the smooth denominator is ``a = min(max_cat_smooth, max(min_cat_smooth, num_data / num_category * cat_smooth_ratio))`` + + - the smooth numerator is ``b = a * sum_gradient / sum_hessian`` + +IO Parameters +------------- + +- ``max_bin``, default=\ ``255``, type=int + + - max number of bins that feature values will be bucketed in. + A small number of bins may reduce training accuracy but may increase general power (deal with over-fitting) + + - LightGBM will auto compress memory according to ``max_bin``. + For example, LightGBM will use ``uint8_t`` for feature value if ``max_bin=255`` + +- ``min_data_in_bin``, default=\ ``5``, type=int + + - min number of data inside one bin, use this to avoid one-data-one-bin (may cause over-fitting) + +- ``data_random_seed``, default=\ ``1``, type=int + + - random seed for data partition in parallel learning (does not include feature parallel) + +- ``output_model``, default=\ ``LightGBM_model.txt``, type=string, alias=\ ``model_output``, ``model_out`` + + - file name of output model in training + +- ``input_model``, default=\ ``""``, type=string, alias=\ ``model_input``, ``model_in`` + + - file name of input model + + - for ``prediction`` task, this model will be used for prediction + + - for ``train`` task, training will be continued from this model + +- ``output_result``, default=\ ``LightGBM_predict_result.txt``, + type=string, alias=\ ``predict_result``, ``prediction_result`` + + - file name of prediction result in ``prediction`` task + +- ``is_pre_partition``, default=\ ``false``, type=bool + + - used for parallel learning (does not include feature parallel) + + - ``true`` if training data are pre-partitioned, and different machines use different partitions + +- ``is_sparse``, default=\ ``true``, type=bool, alias=\ ``is_enable_sparse`` + + - used to enable/disable sparse optimization. Set to ``false`` to disable sparse optimization + +- ``two_round``, default=\ ``false``, type=bool, alias=\ ``two_round_loading``, ``use_two_round_loading`` + + - by default, LightGBM will map data file to memory and load features from memory. + This will provide faster data loading speed. But it may run out of memory when the data file is very big + + - set this to ``true`` if data file is too big to fit in memory + +- ``save_binary``, default=\ ``false``, type=bool, alias=\ ``is_save_binary``, ``is_save_binary_file`` + + - if ``true`` LightGBM will save the dataset (including validation data) to a binary file.
+ Speed up the data loading for the next time + +- ``verbosity``, default=\ ``1``, type=int, alias=\ ``verbose`` + + - ``<0`` = Fatal, + ``=0`` = Error (Warn), + ``>0`` = Info + +- ``header``, default=\ ``false``, type=bool, alias=\ ``has_header`` + + - set this to ``true`` if input data has a header + +- ``label``, default=\ ``""``, type=string, alias=\ ``label_column`` + + - specify the label column + + - use number for index, e.g. ``label=0`` means column\_0 is the label + + - add a prefix ``name:`` for column name, e.g. ``label=name:is_click`` + +- ``weight``, default=\ ``""``, type=string, alias=\ ``weight_column`` + + - specify the weight column + + - use number for index, e.g. ``weight=0`` means column\_0 is the weight + + - add a prefix ``name:`` for column name, e.g. ``weight=name:weight`` + + - **Note**: index starts from ``0``. + And it doesn't count the label column when passing type is Index, e.g. when label is column\_0, and weight is column\_1, the correct parameter is ``weight=0`` + +- ``query``, default=\ ``""``, type=string, alias=\ ``query_column``, ``group``, ``group_column`` + + - specify the query/group id column + + - use number for index, e.g. ``query=0`` means column\_0 is the query id + + - add a prefix ``name:`` for column name, e.g. ``query=name:query_id`` + + - **Note**: data should be grouped by query\_id. + Index starts from ``0``. + And it doesn't count the label column when passing type is Index, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` + +- ``ignore_column``, default=\ ``""``, type=string, alias=\ ``ignore_feature``, ``blacklist`` + + - specify some columns to ignore in training + + - use number for index, e.g. ``ignore_column=0,1,2`` means column\_0, column\_1 and column\_2 will be ignored + + - add a prefix ``name:`` for column name, e.g. ``ignore_column=name:c1,c2,c3`` means c1, c2 and c3 will be ignored + + - **Note**: index starts from ``0``. And it doesn't count the label column + +- ``categorical_feature``, default=\ ``""``, type=string, alias=\ ``categorical_column``, ``cat_feature``, ``cat_column`` + + - specify categorical features + + - use number for index, e.g. ``categorical_feature=0,1,2`` means column\_0, column\_1 and column\_2 are categorical features + + - add a prefix ``name:`` for column name, e.g. ``categorical_feature=name:c1,c2,c3`` means c1, c2 and c3 are categorical features + + - **Note**: only supports categorical features with ``int`` type. Index starts from ``0``. And it doesn't count the label column + + - **Note**: the negative values will be treated as **missing values** + +- ``predict_raw_score``, default=\ ``false``, type=bool, alias=\ ``raw_score``, ``is_predict_raw_score`` + + - only used in ``prediction`` task + + - set to ``true`` to predict only the raw scores + + - set to ``false`` to predict transformed scores + +- ``predict_leaf_index``, default=\ ``false``, type=bool, alias=\ ``leaf_index``, ``is_predict_leaf_index`` + + - only used in ``prediction`` task + + - set to ``true`` to predict with leaf index of all trees + +- ``predict_contrib``, default=\ ``false``, type=bool, alias=\ ``contrib``, ``is_predict_contrib`` + + - only used in ``prediction`` task + + - set to ``true`` to estimate `SHAP values`_, which represent how each feature contributes to each prediction.
+ Produces number of features + 1 values where the last value is the expected value of the model output over the training data + +- ``bin_construct_sample_cnt``, default=\ ``200000``, type=int + + - number of data rows that are sampled to construct histogram bins + + - will give a better training result when set larger, but will increase data loading time + + - set this to a larger value if data is very sparse + +- ``num_iteration_predict``, default=\ ``-1``, type=int + + - only used in ``prediction`` task + - used to specify how many trained iterations will be used in prediction + + - ``<= 0`` means no limit + +- ``pred_early_stop``, default=\ ``false``, type=bool + + - if ``true`` will use early-stopping to speed up the prediction. May affect the accuracy + +- ``pred_early_stop_freq``, default=\ ``10``, type=int + + - the frequency of checking early-stopping prediction + +- ``pred_early_stop_margin``, default=\ ``10.0``, type=double + + - the threshold of margin in early-stopping prediction + +- ``use_missing``, default=\ ``true``, type=bool + + - set to ``false`` to disable the special handling of missing values + +- ``zero_as_missing``, default=\ ``false``, type=bool + + - set to ``true`` to treat all zeros as missing values (including the unshown values in libsvm/sparse matrices) + + - set to ``false`` to use ``na`` to represent missing values + +- ``init_score_file``, default=\ ``""``, type=string + + - path to training initial score file, ``""`` will use ``train_data_file`` + ``.init`` (if exists) + +- ``valid_init_score_file``, default=\ ``""``, type=multi-string + + - path to validation initial score file, ``""`` will use ``valid_data_file`` + ``.init`` (if exists) + + - separate by ``,`` for multi-validation data + +Objective Parameters +-------------------- + +- ``sigmoid``, default=\ ``1.0``, type=double + + - parameter for sigmoid function. Will be used in ``binary`` classification and ``lambdarank`` + +- ``huber_delta``, default=\ ``1.0``, type=double + + - parameter for `Huber loss`_. Will be used in ``regression`` task + +- ``fair_c``, default=\ ``1.0``, type=double + + - parameter for `Fair loss`_. Will be used in ``regression`` task + +- ``gaussian_eta``, default=\ ``1.0``, type=double + + - parameter to control the width of Gaussian function. Will be used in ``regression_l1`` and ``huber`` losses + +- ``poission_max_delta_step``, default=\ ``0.7``, type=double + + - parameter used to safeguard optimization + +- ``scale_pos_weight``, default=\ ``1.0``, type=double + + - weight of positive class in ``binary`` classification task + +- ``boost_from_average``, default=\ ``true``, type=bool + + - only used in ``regression`` task + + - adjust initial score to the mean of labels for faster convergence + +- ``is_unbalance``, default=\ ``false``, type=bool + + - used in ``binary`` classification + + - set this to ``true`` if training data are unbalanced + +- ``max_position``, default=\ ``20``, type=int + + - used in ``lambdarank`` + + - will optimize `NDCG`_ at this position + +- ``label_gain``, default=\ ``0,1,3,7,15,31,63,...``, type=multi-double + + - used in ``lambdarank`` + + - relevant gain for labels.
For example, the gain of label ``2`` is ``3`` if using default label gains + + - separate by ``,`` + +- ``num_class``, default=\ ``1``, type=int, alias=\ ``num_classes`` + + - only used in ``multiclass`` classification + +Metric Parameters +----------------- + +- ``metric``, default={``l2`` for regression}, {``binary_logloss`` for binary classification}, {``ndcg`` for lambdarank}, type=multi-enum, + options=\ ``l1``, ``l2``, ``ndcg``, ``auc``, ``binary_logloss``, ``binary_error`` ... + + - ``l1``, absolute loss, alias=\ ``mean_absolute_error``, ``mae`` + + - ``l2``, square loss, alias=\ ``mean_squared_error``, ``mse`` + + - ``l2_root``, root square loss, alias=\ ``root_mean_squared_error``, ``rmse`` + + - ``huber``, `Huber loss`_ + + - ``fair``, `Fair loss`_ + + - ``poisson``, `Poisson regression`_ + + - ``ndcg``, `NDCG`_ + + - ``map``, `MAP`_ + + - ``auc``, `AUC`_ + + - ``binary_logloss``, `log loss`_ + + - ``binary_error``. + For one sample: ``0`` for correct classification, ``1`` for incorrect classification + + - ``multi_logloss``, log loss for multi-class classification + + - ``multi_error``, error rate for multi-class classification + + - support multi metrics, separated by ``,`` + +- ``metric_freq``, default=\ ``1``, type=int + + - frequency for metric output + +- ``is_training_metric``, default=\ ``false``, type=bool + + - set this to ``true`` if you need to output the metric result of training + +- ``ndcg_at``, default=\ ``1,2,3,4,5``, type=multi-int, alias=\ ``ndcg_eval_at``, ``eval_at`` + + - `NDCG`_ evaluation positions, separated by ``,`` + +Network Parameters +------------------ + +The following parameters are used for parallel learning, and are only used for the base (socket) version. + +- ``num_machines``, default=\ ``1``, type=int, alias=\ ``num_machine`` + + - used for parallel learning, the number of machines for parallel learning application + + - need to set this in both socket and mpi versions + +- ``local_listen_port``, default=\ ``12400``, type=int, alias=\ ``local_port`` + + - TCP listen port for local machines + + - you should allow this port in firewall settings before training + +- ``time_out``, default=\ ``120``, type=int + + - socket time-out in minutes + +- ``machine_list_file``, default=\ ``""``, type=string + + - file that lists machines for this parallel learning application + + - each line contains one IP and one port for one machine. The format is ``ip port``, separated by a space + +GPU Parameters +-------------- + +- ``gpu_platform_id``, default=\ ``-1``, type=int + + - OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform. + + - the default value is ``-1``, which means the system-wide default platform + +- ``gpu_device_id``, default=\ ``-1``, type=int + + - OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID + + - the default value is ``-1``, which means the default device in the selected platform + +- ``gpu_use_dp``, default=\ ``false``, type=bool + + - set to ``true`` to use double precision math on GPU (default using single precision) + +Convert Model Parameters +------------------------ + +So far, this feature is only supported in the command line version.
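To show how the GPU parameters documented above combine with the earlier note about ``device`` (a smaller ``max_bin`` for a better speed-up, optional double precision), here is a small illustrative sketch; the values are the documented defaults or examples from the text, not tuned settings:

.. code:: python

    # Illustrative sketch of the GPU-related parameters documented above.
    params = {
        "device": "gpu",
        "gpu_platform_id": -1,   # -1 = system-wide default OpenCL platform
        "gpu_device_id": -1,     # -1 = default device in the selected platform
        "gpu_use_dp": False,     # True enables 64-bit floats but slows training
        "max_bin": 63,           # smaller max_bin is recommended for GPU speed-up
    }
    print(params)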
+ +- ``convert_model_language``, default=\ ``""``, type=string + + - only ``cpp`` is supported so far + + - if ``convert_model_language`` is set when ``task`` is set to ``train``, the model will also be converted + +- ``convert_model``, default=\ ``"gbdt_prediction.cpp"``, type=string + + - output file name of converted model + +Others +------ + +Continued Training with Input Score +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LightGBM supports continued training with initial scores. It uses an additional file to store these initial scores, like the following: + +:: + + 0.5 + -0.1 + 0.9 + ... + +It means the initial score of the first data row is ``0.5``, second is ``-0.1``, and so on. +The initial score file corresponds with data file line by line, and has one score per line. +And if the name of data file is ``train.txt``, the initial score file should be named ``train.txt.init`` and in the same folder as the data file. +In this case LightGBM will auto-load the initial score file if it exists. + +Weight Data +~~~~~~~~~~~ + +LightGBM supports weighted training. It uses an additional file to store weight data, like the following: + +:: + + 1.0 + 0.5 + 0.8 + ... + +It means the weight of the first data row is ``1.0``, second is ``0.5``, and so on. +The weight file corresponds with data file line by line, and has one weight per line. +And if the name of data file is ``train.txt``, the weight file should be named ``train.txt.weight`` and in the same folder as the data file. +In this case LightGBM will auto-load the weight file if it exists. + +**update**: +You can specify the weight column in the data file now. Please refer to the parameter ``weight`` above. + +Query Data +~~~~~~~~~~ + +LambdaRank learning needs query information for the training data. +LightGBM uses an additional file to store query data, like the following: + +:: + + 27 + 18 + 67 + ... + +It means the first ``27`` samples belong to one query, the next ``18`` samples belong to another, and so on. + +**Note**: data should be ordered by the query. + +If the name of data file is ``train.txt``, the query file should be named ``train.txt.query`` and placed in the same folder as the training data. +In this case LightGBM will load the query file automatically if it exists. + +**update**: +You can specify the query/group id column in the data file now. Please refer to the parameter ``group`` above. + +.. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters + +.. _Huber loss: https://en.wikipedia.org/wiki/Huber_loss + +.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520 + +.. _Poisson regression: https://en.wikipedia.org/wiki/Poisson_regression + +.. _lambdarank: https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf + +.. _Dropouts meet Multiple Additive Regression Trees: https://arxiv.org/abs/1505.01866 + +.. _hyper-threading: https://en.wikipedia.org/wiki/Hyper-threading + +.. _SHAP values: https://arxiv.org/abs/1706.06060 + +.. _NDCG: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG + +.. _MAP: https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision + +.. _AUC: https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + +..
.. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters + +.. _Huber loss: https://en.wikipedia.org/wiki/Huber_loss + +.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520 + +.. _Poisson regression: https://en.wikipedia.org/wiki/Poisson_regression + +.. _lambdarank: https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf + +.. _Dropouts meet Multiple Additive Regression Trees: https://arxiv.org/abs/1505.01866 + +.. _hyper-threading: https://en.wikipedia.org/wiki/Hyper-threading + +.. _SHAP values: https://arxiv.org/abs/1706.06060 + +.. _NDCG: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG + +.. _MAP: https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision + +.. _AUC: https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + +.. _log loss: https://www.kaggle.com/wiki/LogLoss diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst new file mode 100644 index 000000000..693dab44e --- /dev/null +++ b/docs/Python-Intro.rst @@ -0,0 +1,222 @@ +Python Package Introduction +=========================== + +This document gives a basic walkthrough of the LightGBM Python-package. + +**List of other helpful links** + +- `Python Examples <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__ + +- `Python API <./Python-API.rst>`__ + +- `Parameters Tuning <./Parameters-Tuning.rst>`__ + +Install +------- + +Install the Python-package dependencies: +``setuptools``, ``wheel``, ``numpy`` and ``scipy`` are required; ``scikit-learn`` is required for the sklearn interface and recommended: + +:: + + pip install setuptools wheel numpy scipy scikit-learn -U + +Refer to the `Python-package`_ folder for the installation guide. + +To verify your installation, try to ``import lightgbm`` in Python: + +:: + + import lightgbm as lgb + +Data Interface +-------------- + +The LightGBM Python module is able to load data from: + +- libsvm/tsv/csv txt format file + +- Numpy 2D array, pandas object + +- LightGBM binary file + +The data is stored in a ``Dataset`` object. + +**To load a libsvm text file or a LightGBM binary file into Dataset:** + +.. code:: python + + train_data = lgb.Dataset('train.svm.bin') + +**To load a numpy array into Dataset:** + +.. code:: python + + data = np.random.rand(500, 10) # 500 entities, each contains 10 features + label = np.random.randint(2, size=500) # binary target + train_data = lgb.Dataset(data, label=label) + +**To load a scipy.sparse.csr\_matrix array into Dataset:** + +.. code:: python + + csr = scipy.sparse.csr_matrix((dat, (row, col))) + train_data = lgb.Dataset(csr) + +**Saving Dataset into a LightGBM binary file will make loading faster:** + +.. code:: python + + train_data = lgb.Dataset('train.svm.txt') + train_data.save_binary('train.bin') + +**Create validation data:** + +.. code:: python + + test_data = train_data.create_valid('test.svm') + +or + +.. code:: python + + test_data = lgb.Dataset('test.svm', reference=train_data) + +In LightGBM, the validation data should be aligned with the training data. + +**Specify feature names and categorical features:** + +.. code:: python + + train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3']) + +LightGBM can use categorical features as input directly. +It doesn't need to convert them to one-hot encoding, and is much faster than one-hot encoding (about 8x speed-up). + +**Note**: You should convert your categorical features to ``int`` type before you construct ``Dataset``. + +**Weights can be set when needed:** + +.. code:: python + + w = np.random.rand(500, ) + train_data = lgb.Dataset(data, label=label, weight=w) + +or + +.. code:: python + + train_data = lgb.Dataset(data, label=label) + w = np.random.rand(500, ) + train_data.set_weight(w) + +And you can use ``Dataset.set_init_score()`` to set the initial score, and ``Dataset.set_group()`` to set group/query data for ranking tasks. + +**Memory efficient usage:** + +The ``Dataset`` object in LightGBM is very memory-efficient, because it only needs to save the discrete bins. +However, Numpy/Array/Pandas objects are memory expensive. +If you are concerned about your memory consumption, you can save memory by doing the following: + +1. Let ``free_raw_data=True`` (default is ``True``) when constructing the ``Dataset`` + +2. Explicitly set ``raw_data=None`` after the ``Dataset`` has been constructed + +3. Call ``gc``
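A minimal sketch combining the three steps above (assumptions: random data, and step 2 read here as dropping the remaining Python reference to the raw array) might look like this:

.. code:: python

    # Minimal, illustrative sketch of the memory-saving steps listed above.
    import gc
    import numpy as np
    import lightgbm as lgb

    data = np.random.rand(500, 10)
    label = np.random.randint(2, size=500)

    # step 1: free_raw_data=True (the default) lets the Dataset drop the raw data after binning
    train_data = lgb.Dataset(data, label=label, free_raw_data=True)
    bst = lgb.train({'objective': 'binary'}, train_data, num_boost_round=5)

    # step 2: drop the remaining reference to the raw array
    data = None

    # step 3: ask the garbage collector to reclaim the memory
    gc.collect()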
Setting Parameters +------------------ + +LightGBM can use either a list of pairs or a dictionary to set `Parameters <./Parameters.rst>`__. +For instance: + +- Booster parameters: + + .. code:: python + + param = {'num_leaves':31, 'num_trees':100, 'objective':'binary'} + param['metric'] = 'auc' + +- You can also specify multiple eval metrics: + + .. code:: python + + param['metric'] = ['auc', 'binary_logloss'] + +Training +-------- + +Training a model requires a parameter list and a dataset: + +.. code:: python + + num_round = 10 + bst = lgb.train(param, train_data, num_round, valid_sets=[test_data]) + +After training, the model can be saved: + +.. code:: python + + bst.save_model('model.txt') + +The trained model can also be dumped to JSON format: + +.. code:: python + + json_model = bst.dump_model() + +A saved model can be loaded: + +.. code:: python + + bst = lgb.Booster(model_file='model.txt')  # init model + +CV +-- + +Training with 5-fold CV: + +.. code:: python + + num_round = 10 + lgb.cv(param, train_data, num_round, nfold=5) + +Early Stopping +-------------- + +If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. +Early stopping requires at least one set in ``valid_sets``. If there is more than one, it will use all of them: + +.. code:: python + + bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, early_stopping_rounds=10) + bst.save_model('model.txt', num_iteration=bst.best_iteration) + +The model will train until the validation score stops improving. +The validation score needs to improve at least once every ``early_stopping_rounds`` rounds to continue training. + +If early stopping occurs, the model will have an additional field: ``bst.best_iteration``. +Note that ``train()`` will return a model from the last iteration, not the best one. +You can set ``num_iteration=bst.best_iteration`` when saving the model. + +This works with both metrics to minimize (L2, log loss, etc.) and metrics to maximize (NDCG, AUC). +Note that if you specify more than one evaluation metric, all of them will be used for early stopping. + +Prediction +---------- + +A model that has been trained or loaded can perform predictions on data sets: + +.. code:: python + + # 7 entities, each contains 10 features + data = np.random.rand(7, 10) + ypred = bst.predict(data) + +If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_iteration``: + +.. code:: python + + ypred = bst.predict(data, num_iteration=bst.best_iteration) + +.. _Python-package: https://github.com/Microsoft/LightGBM/tree/master/python-package diff --git a/docs/Python-intro.md b/docs/Python-intro.md deleted file mode 100644 index 6e3671fa9..000000000 --- a/docs/Python-intro.md +++ /dev/null @@ -1,206 +0,0 @@ -Python Package Introduction -=========================== - -This document gives a basic walkthrough of LightGBM Python-package. - -***List of other Helpful Links*** -* [Python Examples](https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide) -* [Python API](./Python-API.rst) -* [Parameters Tuning](./Parameters-tuning.md) - -Install ------- - -Install Python-package dependencies, `setuptools`, `wheel`, `numpy` and `scipy` are required, `scikit-learn` is required for sklearn interface and recommended: - -``` -pip install setuptools wheel numpy scipy scikit-learn -U -``` - -Refer to [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) folder for the installation guide. 
- -To verify your installation, try to `import lightgbm` in Python: - -``` -import lightgbm as lgb -``` - -Data Interface --------------- - -The LightGBM Python module is able to load data from: - -- libsvm/tsv/csv txt format file -- Numpy 2D array, pandas object -- LightGBM binary file - -The data is stored in a ```Dataset``` object. - -#### To load a libsvm text file or a LightGBM binary file into ```Dataset```: - -```python -train_data = lgb.Dataset('train.svm.bin') -``` - -#### To load a numpy array into ```Dataset```: - -```python -data = np.random.rand(500, 10) # 500 entities, each contains 10 features -label = np.random.randint(2, size=500) # binary target -train_data = lgb.Dataset(data, label=label) -``` - -#### To load a scpiy.sparse.csr_matrix array into ```Dataset```: - -```python -csr = scipy.sparse.csr_matrix((dat, (row, col))) -train_data = lgb.Dataset(csr) -``` - -#### Saving ```Dataset``` into a LightGBM binary file will make loading faster: - -```python -train_data = lgb.Dataset('train.svm.txt') -train_data.save_binary('train.bin') -``` - -#### Create validation data: - -```python -test_data = train_data.create_valid('test.svm') -``` - -or - -```python -test_data = lgb.Dataset('test.svm', reference=train_data) -``` - -In LightGBM, the validation data should be aligned with training data. - -#### Specific feature names and categorical features: - -```python -train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3']) -``` - -LightGBM can use categorical features as input directly. It doesn't need to covert to one-hot coding, and is much faster than one-hot coding (about 8x speed-up). - -**Note**:You should convert your categorical features to int type before you construct `Dataset`. - -#### Weights can be set when needed: - -```python -w = np.random.rand(500, ) -train_data = lgb.Dataset(data, label=label, weight=w) -``` - -or - -```python -train_data = lgb.Dataset(data, label=label) -w = np.random.rand(500, ) -train_data.set_weight(w) -``` - -And you can use `Dataset.set_init_score()` to set initial score, and `Dataset.set_group()` to set group/query data for ranking tasks. - -#### Memory efficent usage - -The `Dataset` object in LightGBM is very memory-efficient, due to it only need to save discrete bins. -However, Numpy/Array/Pandas object is memory cost. If you concern about your memory consumption. You can save memory accroding to following: - -1. Let ```free_raw_data=True```(default is ```True```) when constructing the ```Dataset``` -2. Explicit set ```raw_data=None``` after the ```Dataset``` has been constructed -3. Call ```gc``` - -Setting Parameters ------------------- - -LightGBM can use either a list of pairs or a dictionary to set [Parameters](./Parameters.md). For instance: - -* Booster parameters: - -```python -param = {'num_leaves':31, 'num_trees':100, 'objective':'binary'} -param['metric'] = 'auc' -``` - -* You can also specify multiple eval metrics: - -```python -param['metric'] = ['auc', 'binary_logloss'] -``` - -Training --------- - -Training a model requires a parameter list and data set. - -```python -num_round = 10 -bst = lgb.train(param, train_data, num_round, valid_sets=[test_data]) -``` - -After training, the model can be saved. - -```python -bst.save_model('model.txt') -``` - -The trained model can also be dumped to JSON format. - -```python -# dump model -json_model = bst.dump_model() -``` - -A saved model can be loaded. 
- -```python -bst = lgb.Booster(model_file='model.txt') #init model -``` - -CV --- - -Training with 5-fold CV: - -```python -num_round = 10 -lgb.cv(param, train_data, num_round, nfold=5) -``` - -Early Stopping --------------- - -If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. -Early stopping requires at least one set in `valid_sets`. If there's more than one, it will use all of them. - -```python -bst = lgb.train(param, train_data, num_round, valid_sets=valid_sets, early_stopping_rounds=10) -bst.save_model('model.txt', num_iteration=bst.best_iteration) -``` - -The model will train until the validation score stops improving. Validation error needs to improve at least every `early_stopping_rounds` to continue training. - -If early stopping occurs, the model will have an additional field: `bst.best_iteration`. Note that `train()` will return a model from the last iteration, not the best one. And you can set `num_iteration=bst.best_iteration` when saving model. - -This works with both metrics to minimize (L2, log loss, etc.) and to maximize (NDCG, AUC). Note that if you specify more than one evaluation metric, all of them will be used for early stopping. - -Prediction ----------- - -A model that has been trained or loaded can perform predictions on data sets. - -```python -# 7 entities, each contains 10 features -data = np.random.rand(7, 10) -ypred = bst.predict(data) -``` - -If early stopping is enabled during training, you can get predictions from the best iteration with `bst.best_iteration`: - -```python -ypred = bst.predict(data, num_iteration=bst.best_iteration) -``` diff --git a/docs/Quick-Start.md b/docs/Quick-Start.md deleted file mode 100644 index 2fed71e66..000000000 --- a/docs/Quick-Start.md +++ /dev/null @@ -1,118 +0,0 @@ -# Quick Start - -This is a quick start guide for LightGBM of cli version. - -Follow the [Installation Guide](./Installation-Guide.rst) to install LightGBM first. - -***List of other Helpful Links*** -* [Parameters](./Parameters.md) -* [Parameters Tuning](./Parameters-tuning.md) -* [Python-package Quick Start](./Python-intro.md) -* [Python API](./Python-API.rst) - -## Training Data Format - -LightGBM supports input data file with [CSV](https://en.wikipedia.org/wiki/Comma-separated_values), [TSV](https://en.wikipedia.org/wiki/Tab-separated_values) and [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) formats. - -Label is the data of first column, and there is no header in the file. - -### Categorical Feature Support - -update 12/5/2016: - -LightGBM can use categorical feature directly (without one-hot coding). The experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up compared with one-hot coding. - -For the setting details, please refer to [Parameters](./Parameters.md). - -### Weight and Query/Group Data - -LightGBM also support weighted training, it needs an additional [weight data](./Parameters.md). And it needs an additional [query data](./Parameters.md) for ranking task. - -update 11/3/2016: - -1. support input with header now -2. can specific label column, weight column and query/group id column. Both index and column are supported -3. can specific a list of ignored columns - -## Parameter Quick Look - -The parameter format is ```key1=value1 key2=value2 ... ``` . And parameters can be in both config file and command line. 
- -Some important parameters: - -* ```config```, default=```""```, type=string, alias=```config_file``` - * path of config file -* ```task```, default=```train```, type=enum, options=```train```,```prediction``` - * ```train``` for training - * ```prediction``` for prediction. -* `application`, default=`regression`, type=enum, options=`regression`,`regression_l1`,`huber`,`fair`,`poisson`,`binary`,`lambdarank`,`multiclass`, alias=`objective`,`app` - * `regression`, regression application - * `regression_l2`, L2 loss, alias=`mean_squared_error`,`mse` - * `regression_l1`, L1 loss, alias=`mean_absolute_error`,`mae` - * `huber`, [Huber loss](https://en.wikipedia.org/wiki/Huber_loss "Huber loss - Wikipedia") - * `fair`, [Fair loss](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520) - * `poisson`, [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression "Poisson regression") - * `binary`, binary classification application - * `lambdarank`, [lambdarank](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf) application - * The label should be `int` type in lambdarank tasks, and larger number represent the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect). - * `label_gain` can be used to set the gain(weight) of `int` label. - * `multiclass`, multi-class classification application, should set `num_class` as well -* `boosting`, default=`gbdt`, type=enum, options=`gbdt`,`rf`,`dart`,`goss`, alias=`boost`,`boosting_type` - * `gbdt`, traditional Gradient Boosting Decision Tree - * `rf`, Random Forest - * `dart`, [Dropouts meet Multiple Additive Regression Trees](https://arxiv.org/abs/1505.01866) - * `goss`, Gradient-based One-Side Sampling -* ```data```, default=```""```, type=string, alias=```train```,```train_data``` - * training data, LightGBM will train from this data -* ```valid```, default=```""```, type=multi-string, alias=```test```,```valid_data```,```test_data``` - * validation/test data, LightGBM will output metrics for these data - * support multi validation data, separate by ```,``` -* ```num_iterations```, default=```100```, type=int, alias=```num_iteration```,```num_tree```,```num_trees```,```num_round```,```num_rounds``` - * number of boosting iterations/trees -* ```learning_rate```, default=```0.1```, type=double, alias=```shrinkage_rate``` - * shrinkage rate -* ```num_leaves```, default=```31```, type=int, alias=```num_leaf``` - * number of leaves in one tree -* ```tree_learner```, default=```serial```, type=enum, options=```serial```,```feature```,```data``` - * ```serial```, single machine tree learner - * ```feature```, feature parallel tree learner - * ```data```, data parallel tree learner - * Refer to [Parallel Learning Guide](./Parallel-Learning-Guide.rst) to get more details. -* ```num_threads```, default=OpenMP_default, type=int, alias=```num_thread```,```nthread``` - * Number of threads for LightGBM. - * For the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPU using [hyper-threading](https://en.wikipedia.org/wiki/Hyper-threading) to generate 2 threads per CPU core). - * For parallel learning, should not use full CPU cores since this will cause poor performance for the network. -* ```max_depth```, default=```-1```, type=int - * Limit the max depth for tree model. This is used to deal with overfit when #data is small. Tree still grow by leaf-wise. 
- * ```< 0``` means no limit -* ```min_data_in_leaf```, default=```20```, type=int, alias=```min_data_per_leaf``` , ```min_data``` - * Minimal number of data in one leaf. Can use this to deal with over-fit. -* ```min_sum_hessian_in_leaf```, default=```1e-3```, type=double, alias=```min_sum_hessian_per_leaf```, ```min_sum_hessian```, ```min_hessian``` - * Minimal sum hessian in one leaf. Like ```min_data_in_leaf```, can use this to deal with over-fit. - -For all parameters, please refer to [Parameters](./Parameters.md). - -## Run LightGBM - -For Windows: -``` -lightgbm.exe config=your_config_file other_args ... -``` - -For Unix: -``` -./lightgbm config=your_config_file other_args ... -``` - -Parameters can be both in the config file and command line, and the parameters in command line have higher priority than in config file. -For example, following command line will keep 'num_trees=10' and ignore same parameter in config file. -``` -./lightgbm config=train.conf num_trees=10 -``` - -## Examples - -* [Binary Classification](https://github.com/Microsoft/LightGBM/tree/master/examples/binary_classification) -* [Regression](https://github.com/Microsoft/LightGBM/tree/master/examples/regression) -* [Lambdarank](https://github.com/Microsoft/LightGBM/tree/master/examples/lambdarank) -* [Parallel Learning](https://github.com/Microsoft/LightGBM/tree/master/examples/parallel_learning) diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst new file mode 100644 index 000000000..8199475db --- /dev/null +++ b/docs/Quick-Start.rst @@ -0,0 +1,219 @@ +Quick Start +=========== + +This is a quick start guide for the LightGBM CLI version. + +Follow the `Installation Guide <./Installation-Guide.rst>`__ to install LightGBM first. + +**List of other helpful links** + +- `Parameters <./Parameters.rst>`__ + +- `Parameters Tuning <./Parameters-Tuning.rst>`__ + +- `Python-package Quick Start <./Python-Intro.rst>`__ + +- `Python API <./Python-API.rst>`__ + +Training Data Format +-------------------- + +LightGBM supports input data files in `CSV`_, `TSV`_ and `LibSVM`_ formats. + +The label is the data in the first column, and there is no header in the file. + +Categorical Feature Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +update 12/5/2016: + +LightGBM can use categorical features directly (without one-hot encoding). +The experiment on `Expo data`_ shows about 8x speed-up compared with one-hot encoding. + +For the setting details, please refer to `Parameters <./Parameters.rst>`__. + +Weight and Query/Group Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LightGBM also supports weighted training; it needs an additional `weight data <./Parameters.rst#io-parameters>`__ file. +It also needs an additional `query data <./Parameters.rst#io-parameters>`_ file for ranking tasks. + +update 11/3/2016: + +1. supports input with header now + +2. can specify label column, weight column and query/group id column. + Both index and column name are supported + +3. can specify a list of ignored columns
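Purely as an illustration of the layout described above (the file name ``train.csv``, its size and the use of NumPy are assumptions of this sketch, not part of the guide), a training file with the label in the first column and no header, plus an optional weight file, could be produced like this before pointing the CLI at it:

.. code:: python

    # Hypothetical helper: write a small CSV training file with the label in the
    # first column and no header row, plus an aligned weight file.
    import numpy as np

    label = np.random.randint(2, size=100).reshape(-1, 1)  # first column: label
    features = np.random.rand(100, 5)                      # remaining columns: features
    np.savetxt('train.csv', np.hstack([label, features]), delimiter=',', fmt='%.6f')

    # Optional companion weight file (one weight per data row)
    np.savetxt('train.csv.weight', np.ones(100), fmt='%.3f')

    # The CLI could then be run with something like:
    #   ./lightgbm config=train.conf data=train.csv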
Parameter Quick Look +-------------------- + +The parameter format is ``key1=value1 key2=value2 ...``. +Parameters can be set both in the config file and on the command line. + +Some important parameters: + +- ``config``, default=\ ``""``, type=string, alias=\ ``config_file`` + + - path to config file + +- ``task``, default=\ ``train``, type=enum, options=\ ``train``, ``prediction`` + + - ``train`` for training + + - ``prediction`` for prediction + +- ``application``, default=\ ``regression``, type=enum, + options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``binary``, ``lambdarank``, ``multiclass``, + alias=\ ``objective``, ``app`` + + - ``regression``, regression application + + - ``regression_l2``, L2 loss, alias=\ ``mean_squared_error``, ``mse`` + + - ``regression_l1``, L1 loss, alias=\ ``mean_absolute_error``, ``mae`` + + - ``huber``, `Huber loss`_ + + - ``fair``, `Fair loss`_ + + - ``poisson``, `Poisson regression`_ + + - ``binary``, binary classification application + + - ``lambdarank``, `lambdarank`_ application + + - the label should be ``int`` type in lambdarank tasks, + and a larger number represents higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) + + - ``label_gain`` can be used to set the gain (weight) of ``int`` labels + + - ``multiclass``, multi-class classification application, ``num_class`` should be set as well + +- ``boosting``, default=\ ``gbdt``, type=enum, + options=\ ``gbdt``, ``rf``, ``dart``, ``goss``, + alias=\ ``boost``, ``boosting_type`` + + - ``gbdt``, traditional Gradient Boosting Decision Tree + + - ``rf``, Random Forest + + - ``dart``, `Dropouts meet Multiple Additive Regression Trees`_ + + - ``goss``, Gradient-based One-Side Sampling + +- ``data``, default=\ ``""``, type=string, alias=\ ``train``, ``train_data`` + + - training data, LightGBM will train from this data + +- ``valid``, default=\ ``""``, type=multi-string, alias=\ ``test``, ``valid_data``, ``test_data`` + + - validation/test data, LightGBM will output metrics for these data + + - multiple validation data are supported, separated by ``,`` + +- ``num_iterations``, default=\ ``100``, type=int, + alias=\ ``num_iteration``, ``num_tree``, ``num_trees``, ``num_round``, ``num_rounds`` + + - number of boosting iterations/trees + +- ``learning_rate``, default=\ ``0.1``, type=double, alias=\ ``shrinkage_rate`` + + - shrinkage rate + +- ``num_leaves``, default=\ ``31``, type=int, alias=\ ``num_leaf`` + + - number of leaves in one tree + +- ``tree_learner``, default=\ ``serial``, type=enum, options=\ ``serial``, ``feature``, ``data`` + + - ``serial``, single machine tree learner + + - ``feature``, feature parallel tree learner + + - ``data``, data parallel tree learner + + - refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details + +- ``num_threads``, default=\ ``OpenMP_default``, type=int, alias=\ ``num_thread``, ``nthread`` + + - number of threads for LightGBM + + - for the best speed, set this to the number of **real CPU cores**, + not the number of threads (most CPUs use `hyper-threading`_ to generate 2 threads per CPU core) + + - for parallel learning, do not use all CPU cores, since this will cause poor performance for the network + +- ``max_depth``, default=\ ``-1``, type=int + + - limit the max depth of the tree model. + This is used to deal with over-fitting when ``#data`` is small. + The tree still grows leaf-wise + + - ``< 0`` means no limit + +- ``min_data_in_leaf``, default=\ ``20``, type=int, alias=\ ``min_data_per_leaf`` , ``min_data`` + + - minimal number of data in one leaf. 
Can use this to deal with over-fitting + +- ``min_sum_hessian_in_leaf``, default=\ ``1e-3``, type=double, + alias=\ ``min_sum_hessian_per_leaf``, ``min_sum_hessian``, ``min_hessian`` + + - minimal sum hessian in one leaf. Like ``min_data_in_leaf``, can be used to deal with over-fitting + +For all parameters, please refer to `Parameters <./Parameters.rst>`__. + +Run LightGBM +------------ + +For Windows: + +:: + + lightgbm.exe config=your_config_file other_args ... + +For Unix: + +:: + + ./lightgbm config=your_config_file other_args ... + +Parameters can be both in the config file and command line, and the parameters in command line have higher priority than in config file. +For example, following command line will keep ``num_trees=10`` and ignore the same parameter in config file. + +:: + + ./lightgbm config=train.conf num_trees=10 + +Examples +-------- + +- `Binary Classification `__ + +- `Regression `__ + +- `Lambdarank `__ + +- `Parallel Learning `__ + +.. _CSV: https://en.wikipedia.org/wiki/Comma-separated_values + +.. _TSV: https://en.wikipedia.org/wiki/Tab-separated_values + +.. _LibSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ + +.. _Expo data: http://stat-computing.org/dataexpo/2009/ + +.. _Huber loss: https://en.wikipedia.org/wiki/Huber_loss + +.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520 + +.. _Poisson regression: https://en.wikipedia.org/wiki/Poisson_regression + +.. _lambdarank: https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf + +.. _Dropouts meet Multiple Additive Regression Trees: https://arxiv.org/abs/1505.01866 + +.. _hyper-threading: https://en.wikipedia.org/wiki/Hyper-threading diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index f5857f0cb..000000000 --- a/docs/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Documentation - -Documentation for LightGBM is generated using [Sphinx](http://www.sphinx-doc.org/) and [recommonmark](https://recommonmark.readthedocs.io/). - -After each commit on `master`, documentation is updated and published to [https://lightgbm.readthedocs.io/](https://lightgbm.readthedocs.io/). - -## Build - -You can build the documentation locally. Just run in `docs` folder: - -```sh -pip install -r requirements.txt -make html -``` diff --git a/docs/README.rst b/docs/README.rst new file mode 100644 index 000000000..0e457b399 --- /dev/null +++ b/docs/README.rst @@ -0,0 +1,16 @@ +Documentation +============= + +Documentation for LightGBM is generated using `Sphinx `__. + +After each commit on ``master``, documentation is updated and published to `Read the Docs `__. + +Build +----- + +You can build the documentation locally. Just run in ``docs`` folder: + +.. 
code:: sh + + pip install sphinx sphinx_rtd_theme + make html diff --git a/docs/_static/images/gcc-bars.png b/docs/_static/images/gcc-bars.png new file mode 100644 index 000000000..09a6f33c2 Binary files /dev/null and b/docs/_static/images/gcc-bars.png differ diff --git a/docs/_static/images/gcc-chart.png b/docs/_static/images/gcc-chart.png new file mode 100644 index 000000000..f6890c62a Binary files /dev/null and b/docs/_static/images/gcc-chart.png differ diff --git a/docs/_static/images/gcc-comparison-1.png b/docs/_static/images/gcc-comparison-1.png new file mode 100644 index 000000000..baab5f81b Binary files /dev/null and b/docs/_static/images/gcc-comparison-1.png differ diff --git a/docs/_static/images/gcc-comparison-2.png b/docs/_static/images/gcc-comparison-2.png new file mode 100644 index 000000000..213fffa4e Binary files /dev/null and b/docs/_static/images/gcc-comparison-2.png differ diff --git a/docs/_static/images/gcc-meetup-1.png b/docs/_static/images/gcc-meetup-1.png new file mode 100644 index 000000000..cf205e04d Binary files /dev/null and b/docs/_static/images/gcc-meetup-1.png differ diff --git a/docs/_static/images/gcc-meetup-2.png b/docs/_static/images/gcc-meetup-2.png new file mode 100644 index 000000000..cf5d6208c Binary files /dev/null and b/docs/_static/images/gcc-meetup-2.png differ diff --git a/docs/_static/images/gcc-table.png b/docs/_static/images/gcc-table.png new file mode 100644 index 000000000..2a91e70a5 Binary files /dev/null and b/docs/_static/images/gcc-table.png differ diff --git a/docs/_static/js/rst_links_fix.js b/docs/_static/js/rst_links_fix.js index 26bcc2d2a..9fab7c8e2 100644 --- a/docs/_static/js/rst_links_fix.js +++ b/docs/_static/js/rst_links_fix.js @@ -1,4 +1,4 @@ -window.onload = function() { - $('a[href^="./"][href$=".md"]').attr('href', (i, val) => { return val.replace('.md', '.html'); }); /* Replace '.md' with '.html' in all internal links like './[Something].md' */ - $('a[href^="./"][href$=".rst"]').attr('href', (i, val) => { return val.replace('.rst', '.html'); }); /* Replace '.rst' with '.html' in all internal links like './[Something].rst' */ -} +$(function() { + $('a[href^="./"][href*=".rst"]').attr('href', (i, val) => { return val.replace('.rst', '.html'); }); /* Replace '.rst' with '.html' in all internal links like './[Something].rst[#anchor]' */ + $('.wy-nav-content').each(function () { this.style.setProperty('max-width', 'none', 'important'); }); +}); diff --git a/docs/conf.py b/docs/conf.py index 6a96a3b7b..5c3a3bb07 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,14 +19,13 @@ # import os import sys +import sphinx +from sphinx.errors import VersionRequirementError curr_path = os.path.dirname(os.path.realpath(__file__)) libpath = os.path.join(curr_path, '../python-package/') sys.path.insert(0, libpath) -from recommonmark.parser import CommonMarkParser -from recommonmark.transform import AutoStructify - # -- mock out modules from unittest.mock import Mock MOCK_MODULES = [ @@ -42,8 +41,10 @@ for mod_name in MOCK_MODULES: os.environ['LIGHTGBM_BUILD_DOC'] = '1' # If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' +needs_sphinx = '1.3' # Due to sphinx.ext.napoleon +if needs_sphinx > sphinx.__version__: + message = 'This project needs at least Sphinx v%s' % needs_sphinx + raise VersionRequirementError(message) # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -60,10 +61,7 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -source_parsers = { - '.md': CommonMarkParser, -} -source_suffix = ['.rst', '.md'] +# source_suffix = ['.rst', '.md'] # The master toctree document. master_doc = 'index' @@ -151,20 +149,20 @@ latex_elements = { # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'LightGBM.tex', 'LightGBM Documentation', - 'Microsoft Corporation', 'manual'), -] +# latex_documents = [ +# (master_doc, 'LightGBM.tex', 'LightGBM Documentation', +# 'Microsoft Corporation', 'manual'), +# ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'lightgbm', 'LightGBM Documentation', - [author], 1) -] +# man_pages = [ +# (master_doc, 'lightgbm', 'LightGBM Documentation', +# [author], 1) +# ] # -- Options for Texinfo output ------------------------------------------- @@ -172,19 +170,12 @@ man_pages = [ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'LightGBM', 'LightGBM Documentation', - author, 'LightGBM', 'One line description of project.', - 'Miscellaneous'), -] +# texinfo_documents = [ +# (master_doc, 'LightGBM', 'LightGBM Documentation', +# author, 'LightGBM', 'One line description of project.', +# 'Miscellaneous'), +# ] -# https://recommonmark.readthedocs.io/en/latest/ -github_doc_root = 'https://github.com/Microsoft/LightGBM/tree/master/docs/' def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - }, True) - app.add_transform(AutoStructify) app.add_javascript("js/rst_links_fix.js") diff --git a/docs/gcc-Tips.rst b/docs/gcc-Tips.rst new file mode 100644 index 000000000..cc28e77ec --- /dev/null +++ b/docs/gcc-Tips.rst @@ -0,0 +1,51 @@ +Recommendations When Using gcc +============================== + +It is recommended to use ``-O3 -mtune=native`` to achieve maximum speed during LightGBM training. + +Using Intel Ivy Bridge CPU on 1M x 1K Bosch dataset, the performance increases as follow: + ++-------------------------------------+---------------------+ +| Compilation Flag | Performance Index | ++=====================================+=====================+ +| ``-O2 -mtune=core2`` | 100.00% | ++-------------------------------------+---------------------+ +| ``-O2 -mtune=native`` | 100.90% | ++-------------------------------------+---------------------+ +| ``-O3 -mtune=native`` | 102.78% | ++-------------------------------------+---------------------+ +| ``-O3 -ffast-math -mtune=native`` | 100.64% | ++-------------------------------------+---------------------+ + +You can find more details on the experimentation below: + +- `Laurae++/Benchmarks `__ + +- `Laurae2/gbt\_benchmarks `__ + +- `Laurae's Benchmark Master Data (Interactive) `__ + +- `Kaggle Paris Meetup #12 Slides `__ + +Some explanatory pictures: + +.. image:: ./_static/images/gcc-table.png + :align: center + +.. image:: ./_static/images/gcc-bars.png + :align: center + +.. 
image:: ./_static/images/gcc-chart.png + :align: center + +.. image:: ./_static/images/gcc-comparison-1.png + :align: center + +.. image:: ./_static/images/gcc-comparison-2.png + :align: center + +.. image:: ./_static/images/gcc-meetup-1.png + :align: center + +.. image:: ./_static/images/gcc-meetup-2.png + :align: center diff --git a/docs/gcc-tips.Rmd b/docs/gcc-tips.Rmd deleted file mode 100644 index a47e8adf6..000000000 --- a/docs/gcc-tips.Rmd +++ /dev/null @@ -1,35 +0,0 @@ -# Recommendations when using gcc - -It is recommended to use `-O3 -mtune=native` to achieve maximum speed during LightGBM training. - -Using Intel Ivy Bridge CPU on 1M x 1K Bosch dataset, the performance increases as follow: - -| Compilation Flag | Performance Index | -| --- | ---: | -| `-O2 -mtune=core2` | 100.00% | -| `-O2 -mtune=native` | 100.90% | -| `-O3 -mtune=native` | 102.78% | -| `-O3 -ffast-math -mtune=native` | 100.64% | - -You can find more details on the experimentation below: - -* [Laurae++/Benchmarks](https://sites.google.com/view/lauraepp/benchmarks) -* [Laurae2/gbt_benchmarks](https://github.com/Laurae2/gbt_benchmarks) -* [Laurae's Benchmark Master Data (Interactive)](https://public.tableau.com/views/gbt_benchmarks/Master-Data?:showVizHome=no) -* [Kaggle Paris Meetup #12 Slides](https://drive.google.com/file/d/0B6qJBmoIxFe0ZHNCOXdoRWMxUm8/view) - -Some pictures below: - -![gcc table](https://cloud.githubusercontent.com/assets/9083669/26027337/c376e22e-380c-11e7-91bc-fe0a333c03e9.png) - -![gcc bars](https://cloud.githubusercontent.com/assets/9083669/26027338/d1caebcc-380c-11e7-864e-d704b39f1e63.png) - -![gcc chart](https://cloud.githubusercontent.com/assets/9083669/26027353/e1bdb866-380c-11e7-97b5-22c7eac349b2.png) - -![gcc comparison 1](https://cloud.githubusercontent.com/assets/9083669/26027401/c31f2f74-380d-11e7-857a-f5119791bed7.png) - -![gcc comparison 2](https://cloud.githubusercontent.com/assets/9083669/26027486/d7d7e72a-380e-11e7-86c3-ccbbf42a9c55.png) - -![gcc meetup 1](https://cloud.githubusercontent.com/assets/9083669/26027427/21b38f44-380e-11e7-9c95-05437782dd46.png) - -![gcc meetup 2](https://cloud.githubusercontent.com/assets/9083669/26027433/362be250-380e-11e7-8982-76ac167bcd3e.png) diff --git a/docs/index.rst b/docs/index.rst index adbd6e9d8..d0cbe37a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,17 +12,17 @@ Welcome to LightGBM's documentation! Installation Guide Quick Start - Python Quick Start + Python Quick Start Features Experiments Parameters - Parameters Tuning + Parameters Tuning Python API Parallel Learning Guide GPU Tutorial - Advanced Topics + Advanced Topics FAQ - Development Guide + Development Guide Indices and Tables ================== diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index b0bd38902..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx<=1.5.6 -sphinxcontrib-napoleon -sphinx_rtd_theme -recommonmark diff --git a/python-package/README.rst b/python-package/README.rst index ddae4e81e..c1a6bb693 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -88,7 +88,7 @@ Refer to the walk through examples in `Python guide folder `_. +Refer to `FAQ `_. Developments ------------