Beautifying doc/code changes
Parent: 3a5403d6fc
Commit: b326a9af94
@@ -119,7 +119,7 @@ class MinibatchSource(cntk_py.MinibatchSource):
 the next minibatch. Must be > 0.
 minibatch_size_in_sequences (`int`, defaults to `None`): number of
 samples to retrieve for the next minibatch. Must be > 0.
-input_map (`dict`): mapping of :class:`cntk.ops.variabls.Variable`
+input_map (`dict`): mapping of :class:`~cntk.ops.variabls.Variable`
 to :class:`StreamInformation` which will be used to convert the
 returned data.
 device (`DeviceDescriptor`, defaults to `None`): CNTK DeviceDescriptor
@@ -127,7 +127,7 @@ class MinibatchSource(cntk_py.MinibatchSource):
 Returns:
 A mapping of :class:`StramInformation` to :class:`MinibatchData` if
 ``input_map`` was not specified. Otherwise, the returned value will
-be a mapping of :class:`cntk.ops.variabls.Variable` to class:`MinibatchData`.
+be a mapping of :class:`~cntk.ops.variabls.Variable` to class:`MinibatchData`.
 '''
 if device is None:
     device = use_default_device()
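To make the ``input_map`` argument above concrete, here is a minimal usage sketch. It assumes the beta-era API shown in this diff; the variable names, shapes and stream names are illustrative, and ``mb_source`` is assumed to be a MinibatchSource created elsewhere (see the text_format_minibatch_source sketch further down).

    from cntk.ops import input_variable

    # Illustrative network inputs; the shapes are assumptions.
    features = input_variable(784)
    labels = input_variable(10)

    # Map each Variable to the reader stream that should feed it.
    input_map = {
        features: mb_source.stream_info('features'),  # Variable -> StreamInformation
        labels: mb_source.stream_info('labels'),
    }

    # With input_map given, the result maps each Variable to its MinibatchData.
    mb = mb_source.next_minibatch(minibatch_size_in_samples=64, input_map=input_map)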
@@ -161,7 +161,7 @@ class MinibatchSource(cntk_py.MinibatchSource):
 Gets the checkpoint state of the MinibatchSource.

 Returns:
-:class:`cntk_py.Dictionary`
+:class:`~cntk_py.Dictionary`
 '''
 return super(MinibatchSource, self).get_checkpoint_state()
@@ -170,7 +170,7 @@ class MinibatchSource(cntk_py.MinibatchSource):
 Restores the MinibatchSource state from the specified checkpoint.

 Args:
-checkpoint (:class:`cntk_py.Dictionary`): checkpoint to restore from
+checkpoint (:class:`~cntk_py.Dictionary`): checkpoint to restore from
 '''
 super(MinibatchSource, self).restore_from_checkpoint(checkpoint)
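The two checkpoint methods above pair up as a simple round trip; a hedged sketch (``mb_source`` is any MinibatchSource):

    # Snapshot the reader position as a cntk_py.Dictionary ...
    state = mb_source.get_checkpoint_state()

    # ... train for a while, possibly persist `state` alongside the model ...

    # ... and later resume reading from exactly that position.
    mb_source.restore_from_checkpoint(state)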
@@ -181,7 +181,7 @@ def _py_dict_to_cntk_dict(py_dict):
 Args:
 py_dict (`dict`): a dictionary to be converted.
 Returns:
-:class:`cntk_py.Dictionary`
+:class:`~cntk_py.Dictionary`
 '''
 res = cntk_py.Dictionary()
 for k, v in py_dict.items():
@@ -244,10 +244,10 @@ class ReaderConfig(dict):
 '''
 Creates an instance of :class:`MinibatchSource` from this
 instance, which can be used to feed data into the `eval()` methods of
-the graph nodes or the `train_minibatch()` of :class:`cntk.trainer.Trainer`.
+the graph nodes or the `train_minibatch()` of :class:`~cntk.trainer.Trainer`.

 Args:
-distributed_communicator (:class:`cntk.distributed.communicator`): distributed communicator
+distributed_communicator (:class:`~cntk.distributed.communicator`): distributed communicator

 Returns:
 instance of :class:`MinibatchSource`
@@ -481,7 +481,7 @@ def text_format_minibatch_source(path, stream_configs, epoch_size=INFINITELY_REP
 epoch_size (`int`, optional): size of an epoch. In case of 0 the size
 of the training set will be taken. Default is max of 64bit.
 randomize (`bool`, optional): whether to randomize the contents of data file.
-distributed_communicator (:class:`cntk.distributed.communicator`): optional distributed communicator
+distributed_communicator (:class:`~cntk.distributed.communicator`): optional distributed communicator

 Returns:
 :class:`MinibatchSource`
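As a usage sketch for the helper documented above (beta-era API; the file name, stream names and dimensions are assumptions):

    from cntk.io import text_format_minibatch_source, StreamConfiguration

    # Stream names must match the aliases used inside the CNTK text-format file.
    mb_source = text_format_minibatch_source(
        'Train-28x28_cntk_text.txt',             # assumed CTF file name
        [StreamConfiguration('features', 784),   # dense pixel input
         StreamConfiguration('labels', 10)])     # one-hot class labels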
@@ -63,7 +63,7 @@ class Learner(cntk_py.Learner):
 Update the parameters associated with this learner.

 Args:
-gradient_values (`dict`): maps :class:`cntk.variables.Parameter` to
+gradient_values (`dict`): maps :class:`~cntk.variables.Parameter` to
 a NumPy array containing the first order gradient values for the
 Parameter w.r.t. the training objective.
 training_sample_count (`int`): training sample count
@@ -102,9 +102,9 @@ class Learner(cntk_py.Learner):
 The learning rate.

 Args:
-minibatch_size (`int`): minibatch size to re-scaled
+minibatch_size (``int``): minibatch size to re-scaled
 the learning rate to the per-sample value (in case when the schedule
-was build with unit=UnitType.minibatch).
+was build with ``unit=UnitType.minibatch``).
 '''
 return super(Learner, self).learning_rate(minibatch_size)
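A short sketch of the re-scaling described above; ``learner`` is assumed to be any Learner whose schedule was built with ``unit=UnitType.minibatch``, and the returned value is the per-sample equivalent:

    # Ask for the rate re-scaled to a per-sample value for minibatches of 32 samples.
    lr_per_sample = learner.learning_rate(32)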
@@ -132,16 +132,16 @@ def training_parameter_schedule(schedule, epoch_size=1, unit=UnitType.sample):
 (0.1, 0.1, 0.01, 0.01, 0.001, 0.001)

 Args:
-schedule (`float` or `list`): if `float`, is the parameter schedule to be used
+schedule (``float`` or ``list``): if ``float``, is the parameter schedule to be used
 for all samples. In case of list, the elements are used as the
 values for ``epoch_size`` samples. If list contains pair, the second element is
 used as a value for (``epoch_size`` x first element) samples
 epoch_size (`int`): number of samples as a scheduling unit. Parameters in
-the schedule change their values every 'epoch_size' samples.
-unit (:class:`cntk.ops.functions.UnitType`): one of two
+the schedule change their values every ``epoch_size`` samples.
+unit (:class:`UnitType`): one of two

-* 'sample': the returned schedule contains per-sample values (default)
-* 'minibatch': the returned schedule contains per-minibatch values.
+* ``sample``: the returned schedule contains per-sample values (default)
+* ``minibatch``: the returned schedule contains per-minibatch values.

 Returns:
 training parameter schedule
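A minimal sketch of the list form, matching the example values shown above (per-sample unit, which is the default):

    from cntk.learner import training_parameter_schedule

    # Each list element covers epoch_size samples; a (multiplier, value) pair
    # would cover multiplier * epoch_size samples.
    s = training_parameter_schedule([0.1, 0.01, 0.001], epoch_size=2)
    # Read back sample by sample this yields 0.1, 0.1, 0.01, 0.01, 0.001, 0.001, ...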
@@ -176,11 +176,11 @@ def learning_rate_schedule(lr, epoch_size=1, unit=UnitType.sample):
 :func:`training_parameter_schedule`).

 Args:
-lr (`float` or `list`): see parameter ``schedule`` in
+lr (``float`` or ``list``): see parameter ``schedule`` in
 :func:`training_parameter_schedule`.
-epoch_size (`int`): see parameter ``epoch_size`` in
+epoch_size (``int``): see parameter ``epoch_size`` in
 :func:`training_parameter_schedule`.
-unit (:class:`cntk.ops.functions.UnitType`): see parameter
+unit (:class:`UnitType`): see parameter
 ``unit`` in :func:`training_parameter_schedule`.

 Returns:
@@ -195,11 +195,11 @@ def momentum_schedule(momentum, epoch_size=1, unit=UnitType.sample):
 :func:`training_parameter_schedule`).

 Args:
-momentum (`float` or `list`): see parameter ``schedule`` in
+momentum (``float`` or ``list``): see parameter ``schedule`` in
 :func:`training_parameter_schedule`.
-epoch_size (`int`): see parameter ``epoch_size`` in
+epoch_size (``int``): see parameter ``epoch_size`` in
 :func:`training_parameter_schedule`.
-unit (:class:`cntk.ops.functions.UnitType`): see parameter
+unit (:class:`UnitType`): see parameter
 ``unit`` in :func:`training_parameter_schedule`.

 If you want to provide momentum values in a sample/minibatch
@@ -223,11 +223,11 @@ def momentum_schedule(momentum, epoch_size=1, unit=UnitType.sample):
 (0.99, 0.99, 0.88, 0.88, 0.77)

 Args:
-momentum (`float` or `list`): see parameter ``schedule`` in
+momentum (``float`` or ``list``): see parameter ``schedule`` in
 :func:`training_parameter_schedule`.
-epoch_size (`int`): see parameter ``epoch_size`` in
+epoch_size (``int``): see parameter ``epoch_size`` in
 :func:`training_parameter_schedule`.
-unit (:class:`cntk.ops.functions.UnitType`): see parameter
+unit (:class:`UnitType`): see parameter
 ``unit`` in :func:`training_parameter_schedule`.

 Returns:
@@ -242,11 +242,11 @@ def momentum_as_time_constant_schedule(momentum, epoch_size=1):
 semantics as :func:`training_parameter_schedule`).

 Args:
-momentum (`float` or `list`): see parameter ``schedule`` in
+momentum (``float`` or ``list``): see parameter ``schedule`` in
 :func:`training_parameter_schedule`.
-epoch_size (`int`): see parameter ``epoch_size`` in
+epoch_size (``int``): see parameter ``epoch_size`` in
 :func:`training_parameter_schedule`.
-unit (:class:`cntk.ops.functions.UnitType`): see parameter
+unit (:class:`UnitType`): see parameter
 ``unit`` in :func:`training_parameter_schedule`.

 CNTK specifies momentum in a minibatch-size agnostic way as the time
@@ -267,9 +267,9 @@ def momentum_as_time_constant_schedule(momentum, epoch_size=1):
 >>> m = momentum_as_time_constant_schedule([1100, 1500], 1000)

 Args:
-momentum (`float` or `list`): see parameter ``schedule`` in
+momentum (``float`` or ``list``): see parameter ``schedule`` in
 :func:`training_parameter_schedule`.
-epoch_size (`int`): see parameter ``epoch_size`` in
+epoch_size (``int``): see parameter ``epoch_size`` in
 :func:`training_parameter_schedule`.

 Returns:
@@ -295,27 +295,33 @@ def sgd(parameters, lr,
 gaussian_noise_injection_std_dev=0.0, gradient_clipping_threshold_per_sample=1E10,
 gradient_clipping_with_truncation=True):
 '''
-Creates an SGD learner instance to learn the parameters.
+Creates an SGD learner instance to learn the parameters. See [1] for more
+information on how to set the parameters.

 Args:
 parameters (`list` of parameters): list of network parameters to tune.
 These can be obtained by the '.parameters()' method of the root
 operator.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
+lr (``float``, ``list`` or output of :func:`learning_rate_schedule`): learning rate
 schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer`
+
+See also:
+[1] L. Bottou. `Stochastic Gradient Descent Tricks
+<http://research.microsoft.com/pubs/192769/tricks-2012.pdf>`_. Neural
+Networks: Tricks of the Trade: Springer, 2012.
 '''
 lr = learning_rate_schedule(lr)
 gaussian_noise_injection_std_dev = training_parameter_schedule(gaussian_noise_injection_std_dev)
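A hedged end-to-end sketch of building an SGD learner and handing it to a Trainer; ``z``, ``ce`` and ``pe`` are assumed to be the model, loss and error Functions (compare the simple_mnist hunks at the end of this commit):

    from cntk.learner import sgd
    from cntk.trainer import Trainer

    learner = sgd(z.parameters, lr=1./320)   # a float lr is turned into a per-sample schedule
    trainer = Trainer(z, ce, pe, learner)    # the learner drives all parameters of z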
@@ -340,26 +346,25 @@ def momentum_sgd(parameters, lr, momentum,
 Args:
 parameters (list of parameters): list of network parameters to tune.
 These can be obtained by the root operator's ``parameters``.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
+lr (``float``, ``list`` or output of :func:`learning_rate_schedule`): learning rate
 schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-momentum (`float`, `list` or output of `:func:momentum_schedule` or
-`:func:momentum_as_time_constant_schedule`): momentum schedule. When the argument
-value is a `float` or a `list`, momentum is converted to a per-sample schedule by
-invoking `:func:momentum_schedule`. Refer to
-https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+momentum (``float``, ``list`` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument
+value is a ``float`` or a ``list``, momentum is converted to a per-sample schedule by
+invoking :func:`momentum_schedule`. Refer to the `wiki
+<https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits>`_.
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer`
 '''
 lr = learning_rate_schedule(lr)
 momentum = momentum_schedule(momentum)
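To illustrate the momentum argument, a sketch using the schedule helpers documented earlier in this file (values are illustrative; ``z`` is the root Function of the model):

    from cntk.learner import (momentum_sgd, learning_rate_schedule,
                              momentum_as_time_constant_schedule)

    lr = learning_rate_schedule(0.01)              # per-sample learning rate
    mm = momentum_as_time_constant_schedule(1100)  # minibatch-size agnostic momentum
    learner = momentum_sgd(z.parameters, lr, mm)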
@@ -381,31 +386,42 @@ def nesterov(parameters, lr, momentum,
 gaussian_noise_injection_std_dev=0.0, gradient_clipping_threshold_per_sample=1E10,
 gradient_clipping_with_truncation=True):
 '''
-Creates a Nesterov SGD learner instance to learn the parameters.
+Creates a Nesterov SGD learner instance to learn the parameters. This was
+originally proposed by Nesterov [1] in 1983 and then proved to work well in
+a deep learning context by Sutskever, et al. [2].

 Args:
 parameters (list of parameters): list of network parameters to tune.
 These can be obtained by the root operator's ``parameters``.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
-schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-momentum (`float`, `list` or output of `:func:momentum_schedule` or
-`:func:momentum_as_time_constant_schedule`): momentum schedule. When the argument
-value is a `float` or a `list`, momentum is converted to a per-sample schedule by
-invoking `:func:momentum_schedule`. Refer to
-https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+lr (``float``, ``list`` or output of :func:`learning_rate_schedule`): learning rate
+schedule. When the argument value is a ``float`` or a ``list``, lr is
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+momentum (``float``, ``list`` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument
+value is a ``float`` or a ``list``, momentum is converted to a per-sample schedule by
+invoking :func:`momentum_schedule`. Refer to the `wiki
+<https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits>`_.
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the
+:class:`~cntk.trainer.Trainer`.
+
+See also:
+[1] Y. Nesterov. A Method of Solving a Convex Programming Problem with Convergence Rate O(1/ sqrt(k)). Soviet Mathematics Doklady, 1983.
+
+[2] I. Sutskever, J. Martens, G. Dahl, and G. Hinton. `On the
+Importance of Initialization and Momentum in Deep Learning
+<http://www.cs.toronto.edu/~fritz/absps/momentum.pdf>`_. Proceedings
+of the 30th International Conference on Machine Learning, 2013.
+
 '''
 lr = learning_rate_schedule(lr)
 momentum = momentum_schedule(momentum)
@@ -427,27 +443,34 @@ def adagrad(parameters, lr, need_ave_multiplier=True,
 gaussian_noise_injection_std_dev=0.0, gradient_clipping_threshold_per_sample=1E10,
 gradient_clipping_with_truncation=True):
 '''
-Creates an AdaGrad learner instance to learn the parameters.
+Creates an AdaGrad learner instance to learn the parameters. See [1] for
+more information.

 Args:
 parameters (list of parameters): list of network parameters to tune.
 These can be obtained by the root operator's ``parameters``.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
+lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate
 schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-need_ave_multiplier ('bool', default):
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+need_ave_multiplier (``bool``, default):
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer`
+
+See also:
+[1] J. Duchi, E. Hazan, and Y. Singer. `Adaptive Subgradient Methods
+for Online Learning and Stochastic Optimization
+<http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_. The Journal of
+Machine Learning Research, 2011.
 '''
 lr = learning_rate_schedule(lr)
 gaussian_noise_injection_std_dev = training_parameter_schedule(gaussian_noise_injection_std_dev)
@@ -471,35 +494,39 @@ def adam_sgd(parameters, lr, momentum,
 gaussian_noise_injection_std_dev=0.0, gradient_clipping_threshold_per_sample=1E10,
 gradient_clipping_with_truncation=True):
 '''
-Creates an Adam learner instance to learn the parameters.
+Creates an Adam learner instance to learn the parameters. See [1] for more
+information.

 Args:
 parameters (list of parameters): list of network parameters to tune.
 These can be obtained by the root operator's ``parameters``.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
+lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate
 schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-momentum (`float`, `list` or output of `:func:momentum_schedule` or
-`:func:momentum_as_time_constant_schedule`): momentum schedule. When the argument
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+momentum (`float`, `list` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument
 value is a `float` or a `list`, momentum is converted to a per-sample schedule by
-invoking `:func:momentum_schedule`. Refer to
-https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits
-variance_momentum (`float`, `list` or output of `:func:momentum_schedule` or
-`:func:momentum_as_time_constant_schedule`): variance momentum schedule. When the argument
+invoking :func:`momentum_schedule`. Refer to the `wiki
+<https://github.com/Microsoft/CNTK/wiki/SGD-block#converting-learning-rate-and-momentum-parameters-from-other-toolkits>`_.
+variance_momentum (`float`, `list` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): variance momentum schedule. When the argument
 value is a `float` or a `list`, variance momentum is converted to a per-sample schedule by
-invoking `:func:momentum_schedule`. Defaults to momentum_as_time_constant_schedule(720000).
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+invoking :func:`momentum_schedule`. Defaults to momentum_as_time_constant_schedule(720000).
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer`
+
+See also:
+[1] D. Kingma, J. Ba. `Adam: A Method for Stochastic Optimization
+<http://arxiv.org/abs/1412.6980>`_. International Conference for
+Learning Representations, 2015.
 '''
 if not low_memory:
     raise NotImplementedError('adam: low_memory=True currently required')
@@ -532,27 +559,27 @@ def rmsprop(parameters, lr,
 Args:
 parameters (list of parameters): list of network parameters to tune.
 These can be obtained by the root operator's ``parameters``.
-lr ('float', `list` or output of `:func:learning_rate_schedule`): learning rate
+lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate
 schedule. When the argument value is a `float` or a `list`, lr is
-converted to a per-sample schedule by invoking `:func:learning_rate_schedule`.
-gamma ('float'):
-inc ('float'):
-dec ('float'):
-max ('float'):
-min ('float'):
-need_ave_multiplier ('bool', default):
-l1_regularization_weight ('float', optional): the L1 regularization weight per sample,
+converted to a per-sample schedule by invoking :func:`learning_rate_schedule`.
+gamma (``float``):
+inc (``float``):
+dec (``float``):
+max (``float``):
+min (``float``):
+need_ave_multiplier (``bool``, default):
+l1_regularization_weight (``float``, optional): the L1 regularization weight per sample,
 defaults to 0.0
-l2_regularization_weight ('float', optional): the L2 regularization weight per sample,
+l2_regularization_weight (``float``, optional): the L2 regularization weight per sample,
 defaults to 0.0
-gaussian_noise_injection_std_dev ('float', optional): the standard deviation
+gaussian_noise_injection_std_dev (``float``, optional): the standard deviation
 of the Gaussian noise added to parameters post update, defaults to 0.0
-gradient_clipping_threshold_per_sample ('float', optional): clipping threshold
+gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold
 per sample, defaults to infinity
-gradient_clipping_with_truncation ('bool', default `True`): gradient clipping
+gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping

 Returns:
-Instance of a :class:`cntk.learner.Learner` that can be passed to the :class:`cntk.trainer.Trainer`
+Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer`
 '''
 lr = learning_rate_schedule(lr)
 gaussian_noise_injection_std_dev = training_parameter_schedule(gaussian_noise_injection_std_dev)
@@ -5,7 +5,7 @@ from enum import Enum, unique
 @unique
 class CloneMethod(Enum):
 '''
-Describes different ways how :class:`cntk.ops.functions.Function.forward`
+Describes different ways how :class:`~cntk.ops.functions.Function.forward`
 works.
 '''
@@ -123,7 +123,7 @@ class Function(cntk_py.Function):
 substitutions requested are applied in the cloned Function instance.

 Args:
-method (:class:`cntk.ops.functions.CloneMethod`): one of
+method (:class:`CloneMethod`): one of

 * 'clone': the returned function gets its own copy of parameters (default)
 * 'share': the returned function shares its parameters with this function
@@ -133,7 +133,7 @@ class Function(cntk_py.Function):
 function to variables in the cloned function

 Returns:
-:class:`Function`: the cloned Function
+:class:`~cntk.ops.functions.Function`: the cloned Function
 '''
 if not isinstance(method, CloneMethod):
     raise ValueError('clone method "%s" is not supported' %
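A hedged sketch of clone() with one of the methods listed above; ``model`` is an existing Function, ``features`` is the Variable currently feeding it, and ``new_input`` is the Variable that should feed the clone (all assumed to exist):

    from cntk.ops.functions import CloneMethod

    # 'share' keeps the parameters shared with the original function;
    # 'clone' would give the copy its own parameters.
    shared_copy = model.clone(CloneMethod.share, {features: new_input})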
@@ -149,7 +149,7 @@ class Function(cntk_py.Function):
 @typemap
 def constants(self):
 '''
-List of all `Constant` variables of this :class:`Function`
+List of all `Constant` variables of this :class:`~cntk.ops.functions.Function`
 '''
 return super(Function, self).constants()
@@ -171,8 +171,8 @@ class Function(cntk_py.Function):
 be used as a list of bools, denoting whether a sequence is a new
 one (`True`) or a continuation of the previous one (`False`).
 Data should be either NumPy arrays or a
-:class:`cntk.io.MinibatchData` instance.
-device (:class:`cntk.device.DeviceDescriptor`): the device descriptor that
+:class:`~cntk.io.MinibatchData` instance.
+device (:class:`~cntk.device.DeviceDescriptor`): the device descriptor that
 contains the type and id of the device on which the computation is
 to be performed.
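A self-contained sketch of evaluating a tiny graph, to show how NumPy data is bound to input Variables (beta-era ops module assumed; the computation runs on the default device unless a DeviceDescriptor is passed):

    import numpy as np
    from cntk.ops import input_variable, plus

    x = input_variable(2)
    y = input_variable(2)
    z = plus(x, y)

    # One list entry per sequence; arrays use the Variables' float32 dtype.
    result = z.eval({x: [np.array([1., 2.], dtype=np.float32)],
                     y: [np.array([10., 20.], dtype=np.float32)]})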
@@ -222,14 +222,14 @@ class Function(cntk_py.Function):
 be used as a list of bools, denoting whether a sequence is a new
 one (`True`) or a continuation of the previous one (`False`).
 Data should be either NumPy arrays or a
-:class:`cntk.io.MinibatchData` instance.
+:class:`~cntk.io.MinibatchData` instance.
 outputs (iterable): outputs to fetch values for.
 keep_for_backward (`set`, default `None`): the subset of the
 Function's output variables for which gradients shall be calculated
 in a subsequent backward call. If `None`, the returned state will
 be `None` and a subsequent call to :func:`backward` will not be
 possible.
-device (:class:`cntk.device.DeviceDescriptor`, default `None`): the device
+device (:class:`~cntk.device.DeviceDescriptor`, default `None`): the device
 descriptor that contains the type and id of the device on which the
 computation is. If `None`, the default device is used.
@@ -371,7 +371,7 @@ class Function(cntk_py.Function):
 specified substitution.

 Args:
-substitution (:class:`cntk.ops.variables.Variable`): the variable
+substitution (:class:`~cntk.ops.variables.Variable`): the variable
 that will replace the placeholder

 Returns:
@@ -23,11 +23,11 @@ class Trainer(cntk_py.Trainer):
 using computed gradients.

 Args:
-model (:class:`cntk.ops.functions.Function`): root node of the function to train
-loss_function (:class:`cntk.ops.functions.Function`): loss function
-eval_function (:class:`cntk.ops.functions.Function`): evaluation function
+model (:class:`~cntk.ops.functions.Function`): root node of the function to train
+loss_function (:class:`~cntk.ops.functions.Function`): loss function
+eval_function (:class:`~cntk.ops.functions.Function`): evaluation function
 parameter_learners (`list`): list of learners from :mod:`cntk.learner`
-distributed_trainer (:class:`cntk.distributed.distributed_trainer`): distributed trainer
+distributed_trainer (:class:`~cntk.distributed.distributed_trainer`): distributed trainer
 '''
 def __init__(self, model, loss_function, eval_function, parameter_learners, distributed_trainer=None):
     # TODO sanitizing should be removed once Swig's typemaps are in place
@@ -62,9 +62,9 @@ class Trainer(cntk_py.Trainer):
 be used as a list of bools, denoting whether a sequence is a new
 one (`True`) or a continuation of the previous one (`False`).
 Data should be either NumPy arrays or a
-:class:`cntk.io.MinibatchData` instance.
+:class:`~cntk.io.MinibatchData` instance.
 outputs (iterable): outputs to fetch values for.
-device (:class:`cntk.device.DeviceDescriptor`): the device descriptor that
+device (:class:`~cntk.device.DeviceDescriptor`): the device descriptor that
 contains the type and id of the device on which the computation is
 to be performed.
@@ -113,8 +113,8 @@ class Trainer(cntk_py.Trainer):
 be used as a list of bools, denoting whether a sequence is a new
 one (`True`) or a continuation of the previous one (`False`).
 Data should be either NumPy arrays or a
-:class:`cntk.io.MinibatchData` instance.
-device (:class:`cntk.device.DeviceDescriptor`): the device descriptor that
+:class:`~cntk.io.MinibatchData` instance.
+device (:class:`~cntk.device.DeviceDescriptor`): the device descriptor that
 contains the type and id of the device on which the computation is
 to be performed.
 Returns:
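Finally, a hypothetical training loop tying the pieces together; ``mb_source``, ``input_map`` and ``trainer`` are assumed to be set up as in the earlier sketches, and the loss property name is an assumption about this beta:

    for i in range(1000):
        mb = mb_source.next_minibatch(64, input_map=input_map)
        trainer.train_minibatch(mb)   # mb already maps Variables to MinibatchData
        if i % 100 == 0:
            print(trainer.previous_minibatch_loss_average)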
@@ -45,11 +45,11 @@ def simple_mnist(debug_output=False):

 # Instantiate the feedforward classification model
 scaled_input = element_times(constant(0.00390625), input)
-netout = fully_connected_classifier_net(
+z = fully_connected_classifier_net(
     scaled_input, num_output_classes, hidden_layers_dim, num_hidden_layers, relu)

-ce = cross_entropy_with_softmax(netout, label)
-pe = classification_error(netout, label)
+ce = cross_entropy_with_softmax(z, label)
+pe = classification_error(z, label)

 try:
     rel_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
@@ -67,7 +67,7 @@ def simple_mnist(debug_output=False):
 }

 # Instantiate the trainer object to drive the model training
-trainer = Trainer(netout, ce, pe, sgd(netout.parameters, lr=0.003125))
+trainer = Trainer(z, ce, pe, sgd(z.parameters, lr=1./320))

 # Get minibatches of images to train with and perform model training
 minibatch_size = 64