diff --git a/bindings/python/cntk/axis.py b/bindings/python/cntk/axis.py index 91e224cb1..3fc85e37b 100644 --- a/bindings/python/cntk/axis.py +++ b/bindings/python/cntk/axis.py @@ -41,7 +41,7 @@ class Axis(cntk_py.Axis): Returns True if the axis is of type static and False otherwise Returns: - `bool`: True if this axis is of type static and False otherwise + bool: True if this axis is of type static and False otherwise ''' return super(Axis, self).is_static_axis() @@ -51,7 +51,7 @@ class Axis(cntk_py.Axis): Returns the name of this axis. Returns: - `str`: the name of this axis. + str: the name of this axis. ''' return super(Axis, self).name() @@ -60,7 +60,7 @@ class Axis(cntk_py.Axis): Returns the integer with which the static axis is defined. For example, 0 = first axis, 1 = second axis, etc. Args: - checked (`bool`): if True then this function will throw an exception if the axis is not static. + checked (bool): if True then this function will throw an exception if the axis is not static. Returns: `int`: the number with which the static axis is defined. @@ -107,7 +107,7 @@ class Axis(cntk_py.Axis): Creates an Axis object representing a new unique dynamic axis. Args: - name (`str`): name of the dynmic axis + name (str): name of the dynamic axis Returns: :class:`Axis`: new unique dynamic axis diff --git a/bindings/python/cntk/device.py b/bindings/python/cntk/device.py index 38c85f907..8718cfddb 100644 --- a/bindings/python/cntk/device.py +++ b/bindings/python/cntk/device.py @@ -35,7 +35,7 @@ def all_devices(): Returns a device descriptor list with all the available devices Returns: - :class:`cntk.device.DeviceDescriptor` list: all device descriptors + :class:`~cntk.device.DeviceDescriptor` list: all device descriptors ''' return cntk_py.DeviceDescriptor.all_devices() @@ -44,7 +44,7 @@ def best(): Returns a device descriptor with the best configuration.
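[Reviewer note, illustrative only and not part of the patch: a minimal sketch of how the device helpers documented above are typically combined, assuming only the functions shown in this file.]

    from cntk.device import best, set_default_device

    # Ask CNTK for the best available device (a GPU if one is usable,
    # otherwise the CPU) and make it the process-wide default.
    set_default_device(best())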
Returns: - :class:`cntk.device.DeviceDescriptor`: Best device descriptor + :class:`~cntk.device.DeviceDescriptor`: Best device descriptor ''' return cntk_py.DeviceDescriptor.best_device() @@ -53,7 +53,7 @@ def cpu(): Returns CPU device descriptor Returns: - :class:`cntk.device.DeviceDescriptor`: CPU device descriptor + :class:`~cntk.device.DeviceDescriptor`: CPU device descriptor ''' return cntk_py.DeviceDescriptor.cpu_device() @@ -62,7 +62,7 @@ def default(): Returns default device Returns: - :class:`cntk.device.DeviceDescriptor`: Default device descriptor + :class:`~cntk.device.DeviceDescriptor`: Default device descriptor ''' return cntk_py.DeviceDescriptor.default_device() @@ -71,7 +71,7 @@ def gpu(device_id): Returns GPU device Returns: - :class:`cntk.device.DeviceDescriptor`: GPU device descriptor + :class:`~cntk.device.DeviceDescriptor`: GPU device descriptor ''' return cntk_py.DeviceDescriptor.gpu_device(device_id) @@ -89,9 +89,9 @@ def set_default_device(new_default_device): Set new device descriptor as default Args: - new_default_device (:class:`cntk.device.DeviceDescriptor`): new device descriptor + new_default_device (:class:`~cntk.device.DeviceDescriptor`): new device descriptor Returns: - :class:`cntk.device.DeviceDescriptor`: id + :class:`~cntk.device.DeviceDescriptor`: id ''' return cntk_py.DeviceDescriptor.set_default_device(new_default_device) diff --git a/bindings/python/cntk/io/__init__.py b/bindings/python/cntk/io/__init__.py index 79585d7bc..6f9d6f955 100644 --- a/bindings/python/cntk/io/__init__.py +++ b/bindings/python/cntk/io/__init__.py @@ -57,9 +57,9 @@ class MinibatchSource(cntk_py.MinibatchSource): Parent class of all minibatch sources. For most cases you will need the helper functions :func:`text_format_minibatch_source` or :func:`minibatch_source`. - A `MinibatchSource` can be indexed by a `StreamInfo`, which will return a - `MinibatchData` object that can be passed e.g. to the - :func:`cntk.trainer.Trainer.train_minibatch` function. + A `MinibatchSource` can be indexed by the stream name, which will return a + :class:`MinibatchData` object that can be passed e.g. to the + :func:`~cntk.trainer.Trainer.train_minibatch` function. ''' def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT, distributed_communicator=None): @@ -81,7 +81,7 @@ class MinibatchSource(cntk_py.MinibatchSource): Describes the stream that this source produces. Returns: - `dict` mapping input names to the stream information + dict mapping input names to the stream information ''' return super(MinibatchSource, self).stream_infos() @@ -98,7 +98,7 @@ class MinibatchSource(cntk_py.MinibatchSource): Return the :class:`StreamInfo` for the given stream name Args: - name (`str`): stream name to fetch :class:`StreamInfo` for + name (str): stream name to fetch :class:`StreamInfo` for ''' return self.stream_info(name) @@ -115,11 +115,11 @@ class MinibatchSource(cntk_py.MinibatchSource): when the MinibatchSource has no more data to return. Args: - minibatch_size_in_samples (`int`): number of samples to retrieve for + minibatch_size_in_samples (int): number of samples to retrieve for the next minibatch. Must be > 0. - minibatch_size_in_sequences (`int`, defaults to `None`): number of + minibatch_size_in_sequences (int, defaults to `None`): number of samples to retrieve for the next minibatch. Must be > 0. 
- input_map (`dict`): mapping of :class:`~cntk.ops.variabls.Variable` + input_map (dict): mapping of :class:`~cntk.ops.variables.Variable` to :class:`StreamInformation` which will be used to convert the returned data. device (`DeviceDescriptor`, defaults to `None`): CNTK DeviceDescriptor @@ -179,7 +179,7 @@ def _py_dict_to_cntk_dict(py_dict): ''' Converts a Python dictionary into a CNTK Dictionary whose values are CNTK DictionaryValue instances. Args: - py_dict (`dict`): a dictionary to be converted. + py_dict (dict): a dictionary to be converted. Returns: :class:`~cntk_py.Dictionary` ''' @@ -208,7 +208,7 @@ def minibatch_source(config, distributed_communicator): ''' Instantiate the CNTK built-in composite minibatch source which is used to stream data into the network. Args: - config (`dict`): a dictionary containing all the key-value configuration entries. + config (dict): a dictionary containing all the key-value configuration entries. distributed_communicator: optional distributed communicator Returns: :class:`MinibatchSource` @@ -227,8 +227,8 @@ class ReaderConfig(dict): Args: deserializers ('list', default is empty): list of deserializers (:class:`ImageDeserializer` for now). - randomize (`bool`, default True): randomize images before every epoch - epoch_size (`int`): epoch size + randomize (bool, default True): randomize images before every epoch + epoch_size (int): epoch size ''' def __init__(self, deserializers=None, randomize=True, epoch_size=INFINITELY_REPEAT): @@ -270,7 +270,7 @@ class Deserializer(dict): ========================== ============ Args: - type (`str`): type of the deserializer + type (str): type of the deserializer See also: https://github.com/microsoft/cntk/wiki/Understanding-and-Extending-Readers @@ -288,7 +288,7 @@ class ImageDeserializer(Deserializer): Args: - filename (`str`): file name of the map file that associates images to + filename (str): file name of the map file that associates images to classes See also: @@ -322,7 +322,7 @@ class ImageDeserializer(Deserializer): of the network with data augmentation. Args: - node (`str` or input node): node or its name + node (str or input node): node or its name transforms (`list` of transforms): the transforms can be created by the static methods `crop`, `scale`, or `mean`. @@ -341,8 +341,8 @@ class ImageDeserializer(Deserializer): ground truth of train or test. Args: - node (`str` or input node): node or its name - num_classes (`int`): number of classes + node (str or input node): node or its name + num_classes (int): number of classes ''' if not isinstance(node, str): @@ -355,7 +355,7 @@ class ImageDeserializer(Deserializer): Crop transform that can be used to pass to `map_features` Args: - crop_type (`str`, default 'center'): 'center' or 'random'. 'random' + crop_type (str, default 'center'): 'center' or 'random'. 'random' is usually used during training while 'center' is usually for testing. Random cropping is a popular data augmentation technique used to improve generalization of the DNN. @@ -368,12 +368,12 @@ class ImageDeserializer(Deserializer): augmentation technique), use colon-delimited values like cropRatio=0.875:0.466 which means 224 crop will be taken from images randomly scaled to have size in [256, 480] range. - jitter_type (`str`, default 'uniRatio'): crop scale jitter type, possible + jitter_type (str, default 'uniRatio'): crop scale jitter type, possible values are 'None', 'UniRatio'. 'uniRatio' means uniform distributed jitter scale between the minimum and maximum cropRatio values.
Returns: - `dict` describing the crop transform + dict describing the crop transform ''' return dict(type='Crop', cropType=crop_type, cropRatio=ratio, jitterType=jitter_type) @@ -384,14 +384,14 @@ class ImageDeserializer(Deserializer): Scale transform that can be used to pass to `map_features` for data augmentation. Args: - width (`int`): width of the image in pixels - height (`int`): height of the image in pixels - channels (`int`): channels of the image - interpolations (`str`, default 'linear'): possible values are + width (int): width of the image in pixels + height (int): height of the image in pixels + channels (int): channels of the image + interpolations (str, default 'linear'): possible values are 'nearest', 'linear', 'cubic', and 'lanczos' Returns: - `dict` describing the scale transform + dict describing the scale transform ''' return dict(type='Scale', width=width, height=height, channels=channels, interpolations=interpolations) @@ -402,11 +402,11 @@ class ImageDeserializer(Deserializer): Mean transform that can be used to pass to `map_features` for data augmentation. Args: - filename (`str`): file that stores the mean values for each pixel + filename (str): file that stores the mean values for each pixel in OpenCV matrix XML format Returns: - `dict` describing the mean transform + dict describing the mean transform ''' return dict(type='Mean', meanFile=filename) @@ -427,7 +427,7 @@ class CTFDeserializer(Deserializer): where Sample=|Input_Name (Value )* Args: - filename (`str`): file name containing the text input + filename (str): file name containing the text input See also: https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader ''' @@ -451,12 +451,12 @@ class CTFDeserializer(Deserializer): Example: for node name 'Apples' an input line could look like this: |Apples 0 1 2 3 4 5 6 7 8 9 Args: - node (`str` or input node): node or its name - dim (`int`): specifies the dimension of the input value vector + node (str or input node): node or its name + dim (int): specifies the dimension of the input value vector (for dense input this directly corresponds to the number of values in each sample, for sparse this represents the upper bound on the range of possible index values). - format (`str`, default 'dense'): 'dense' or 'sparse'. Specifies the input type. - alias (`str`, default None): None or alias name. Optional abbreviated name that + format (str, default 'dense'): 'dense' or 'sparse'. Specifies the input type. + alias (str, default None): None or alias name. Optional abbreviated name that is used in the text file to avoid repeating long input names. For details please see https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader ''' @@ -474,13 +474,13 @@ def text_format_minibatch_source(path, stream_configs, epoch_size=INFINITELY_REP Creates a minibatch source from a CNTKTextFormatReader file. Args: - path ('file'): filename of the data file + path (str): filename of the data file stream_configs (`list` of :class:`StreamConfiguration` instances): list of stream configurations, each of which describes one stream in the file - epoch_size (`int`, optional): size of an epoch. In case of 0 the size + epoch_size (int, optional): size of an epoch. In case of 0 the size of the training set will be taken. Default is max of 64bit. - randomize (`bool`, optional): whether to randomize the contents of data file. + randomize (bool, optional): whether to randomize the contents of data file.
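[Reviewer note, illustrative only and not part of the patch: a sketch of the ImageDeserializer API documented above; the map file name, node names, and image dimensions are hypothetical.]

    from cntk.io import ImageDeserializer

    images = ImageDeserializer('train_map.txt')
    # Random crop plus rescale, the usual training-time augmentation pair.
    images.map_features('features',
                        [ImageDeserializer.crop(crop_type='random', ratio=0.875),
                         ImageDeserializer.scale(width=224, height=224, channels=3)])
    images.map_labels('labels', num_classes=1000)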
distributed_communicator (:class:`~cntk.distributed.communicator`): optional distributed communicator Returns: @@ -499,13 +499,13 @@ class StreamConfiguration(cntk_py.StreamConfiguration): :func:`text_format_minibatch_source`. Args: - name (`str`): name of this stream - dim (`int`): dimensions of this stream. A text format reader reads data + name (str): name of this stream + dim (int): dimensions of this stream. A text format reader reads data as flat arrays. If you need different shapes you can - :func:`cntk.ops.reshape` it later. - is_sparse (`bool`, default `False`): whether the provided data is sparse + :func:`~cntk.ops.reshape` it later. + is_sparse (bool, default `False`): whether the provided data is sparse (`False` by default) - stream_alias (`str`, default ''): name of the stream in the file that is fed to the + stream_alias (str, default ''): name of the stream in the file that is fed to the :func:`text_format_minibatch_source` ''' diff --git a/bindings/python/cntk/learner.py b/bindings/python/cntk/learner.py index 8f26b0d4b..9571a1c6b 100644 --- a/bindings/python/cntk/learner.py +++ b/bindings/python/cntk/learner.py @@ -66,7 +66,7 @@ class Learner(cntk_py.Learner): gradient_values (`dict`): maps :class:`~cntk.variables.Parameter` to a NumPy array containing the first order gradient values for the Parameter w.r.t. the training objective. - training_sample_count (`int`): training sample count + training_sample_count (int): training sample count Returns: `False` to indicate that learning has stopped for all of the parameters associated with this learner @@ -91,7 +91,7 @@ class Learner(cntk_py.Learner): Resets the learning rate. Args: - learning_rate (`float`, `list` or a training schedule): learning rate + learning_rate (float, list or a training schedule): learning rate to reset to ''' learning_rate = learning_rate_schedule(learning_rate) @@ -102,7 +102,7 @@ class Learner(cntk_py.Learner): The learning rate. Args: - minibatch_size (``int``): minibatch size to re-scaled + minibatch_size (int): minibatch size used to re-scale the learning rate to the per-sample value (in case when the schedule was build with ``unit=UnitType.minibatch``). ''' @@ -132,11 +132,11 @@ def training_parameter_schedule(schedule, epoch_size=1, unit=UnitType.sample): (0.1, 0.1, 0.01, 0.01, 0.001, 0.001) Args: - schedule (``float`` or ``list``): if ``float``, is the parameter schedule to be used + schedule (float or list): if float, is the parameter schedule to be used for all samples. In case of list, the elements are used as the values for ``epoch_size`` samples. If list contains pair, the second element is used as a value for (``epoch_size`` x first element) samples - epoch_size (`int`): number of samples as a scheduling unit. Parameters in + epoch_size (int): number of samples as a scheduling unit. Parameters in the schedule change their values every ``epoch_size`` samples. unit (:class:`UnitType`): one of two @@ -145,6 +145,9 @@ def training_parameter_schedule(schedule, epoch_size=1, unit=UnitType.sample): Returns: training parameter schedule + + See also: + :func:`learning_rate_schedule` ''' if not isinstance(unit, UnitType): raise ValueError('schedule unit "%s" is not supported' % @@ -176,15 +179,18 @@ def learning_rate_schedule(lr, epoch_size=1, unit=UnitType.sample): :func:`training_parameter_schedule`). Args: - lr (``float`` or ``list``): see parameter ``schedule`` in + lr (float or list): see parameter ``schedule`` in :func:`training_parameter_schedule`.
- epoch_size (``int``): see parameter ``epoch_size`` in + epoch_size (int): see parameter ``epoch_size`` in :func:`training_parameter_schedule`. unit (:class:`UnitType`): see parameter ``unit`` in :func:`training_parameter_schedule`. Returns: learning rate schedule + + See also: + :func:`training_parameter_schedule` ''' return training_parameter_schedule(lr, epoch_size, unit) @@ -195,9 +201,9 @@ def momentum_schedule(momentum, epoch_size=1, unit=UnitType.sample): :func:`training_parameter_schedule`). Args: - momentum (``float`` or ``list``): see parameter ``schedule`` in + momentum (float or list): see parameter ``schedule`` in :func:`training_parameter_schedule`. - epoch_size (``int``): see parameter ``epoch_size`` in + epoch_size (int): see parameter ``epoch_size`` in :func:`training_parameter_schedule`. unit (:class:`UnitType`): see parameter ``unit`` in :func:`training_parameter_schedule`. @@ -223,9 +229,9 @@ def momentum_schedule(momentum, epoch_size=1, unit=UnitType.sample): (0.99, 0.99, 0.88, 0.88, 0.77) Args: - momentum (``float`` or ``list``): see parameter ``schedule`` in + momentum (float or list): see parameter ``schedule`` in :func:`training_parameter_schedule`. - epoch_size (``int``): see parameter ``epoch_size`` in + epoch_size (int): see parameter ``epoch_size`` in :func:`training_parameter_schedule`. unit (:class:`UnitType`): see parameter ``unit`` in :func:`training_parameter_schedule`. @@ -242,9 +248,9 @@ def momentum_as_time_constant_schedule(momentum, epoch_size=1): semantics as :func:`training_parameter_schedule`). Args: - momentum (``float`` or ``list``): see parameter ``schedule`` in + momentum (float or list): see parameter ``schedule`` in :func:`training_parameter_schedule`. - epoch_size (``int``): see parameter ``epoch_size`` in + epoch_size (int): see parameter ``epoch_size`` in :func:`training_parameter_schedule`. unit (:class:`UnitType`): see parameter ``unit`` in :func:`training_parameter_schedule`. @@ -267,9 +273,9 @@ def momentum_as_time_constant_schedule(momentum, epoch_size=1): >>> m = momentum_as_time_constant_schedule([1100, 1500], 1000) Args: - momentum (``float`` or ``list``): see parameter ``schedule`` in + momentum (float or list): see parameter ``schedule`` in :func:`training_parameter_schedule`. - epoch_size (``int``): see parameter ``epoch_size`` in + epoch_size (int): see parameter ``epoch_size`` in :func:`training_parameter_schedule`. Returns: @@ -299,21 +305,21 @@ def sgd(parameters, lr, information on how to set the parameters. Args: - parameters (`list` of parameters): list of network parameters to tune. + parameters (list of parameters): list of network parameters to tune. These can be obtained by the '.parameters()' method of the root operator. - lr (``float``, ``list`` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a `float` or a `list`, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. 
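[Reviewer note, illustrative only and not part of the patch: a sketch of the schedule helpers documented above, mirroring the (0.1, 0.1, 0.01, 0.01, 0.001, 0.001) docstring example; it assumes UnitType is importable from cntk.learner, as the defaults above suggest.]

    from cntk.learner import learning_rate_schedule, UnitType

    # 0.1 for the first 100 samples, 0.01 for the next 100, then 0.001 from
    # there on; the schedule is queried per sample during training.
    lr = learning_rate_schedule([0.1, 0.01, 0.001], epoch_size=100,
                                unit=UnitType.sample)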
- l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping + gradient_clipping_with_truncation (bool, default ``True``): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer` @@ -346,22 +352,22 @@ def momentum_sgd(parameters, lr, momentum, Args: parameters (list of parameters): list of network parameters to tune. These can be obtained by the root operator's ``parameters``. - lr (``float``, `list````` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a `float` or a `list`, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. - momentum (``float``, ``list`` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument - value is a ``float`` or a ``list``, momentum is converted to a per-sample schedule by + momentum (float, list or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument + value is a float or a list, momentum is converted to a per-sample schedule by invoking :func:`momentum_schedule`. Refer to the `wiki `_. - l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping + gradient_clipping_with_truncation (bool, default ``True``): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer` @@ -387,28 +393,28 @@ def nesterov(parameters, lr, momentum, gradient_clipping_with_truncation=True): ''' Creates a Nesterov SGD learner instance to learn the parameters.
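[Reviewer note, illustrative only and not part of the patch: how the learner factories documented above are typically wired into a Trainer; z (the root Function of a network), loss, and error are hypothetical.]

    from cntk.trainer import Trainer
    from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule

    # Per the sgd docstring above, parameters come from the root operator.
    learner = momentum_sgd(z.parameters(), lr=learning_rate_schedule(0.01),
                           momentum=momentum_schedule(0.9))
    trainer = Trainer(z, loss, error, [learner])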
This was - originally proposed by Nesterov [1] in 1983 and then proved to work well in + originally proposed by Nesterov [1] in 1983 and then shown to work well in a deep learning context by Sutskever, et al. [2]. Args: parameters (list of parameters): list of network parameters to tune. These can be obtained by the root operator's ``parameters``. - lr (``float``, ``list`` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a ``float`` or a ``list``, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. - momentum (``float``, ``list`` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument - value is a ``float`` or a ``list``, momentum is converted to a per-sample schedule by + momentum (float, list or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument + value is a float or a list, momentum is converted to a per-sample schedule by invoking :func:`momentum_schedule`. Refer to the `wiki `_. - l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default ``True``): gradient clipping + gradient_clipping_with_truncation (bool, default ``True``): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the @@ -449,19 +455,19 @@ def adagrad(parameters, lr, need_ave_multiplier=True, Args: parameters (list of parameters): list of network parameters to tune. These can be obtained by the root operator's ``parameters``. - lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a `float` or a `list`, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. 
- need_ave_multiplier (``bool``, default): - l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + need_ave_multiplier (bool, default True): + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping + gradient_clipping_with_truncation (bool, default `True`): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer` @@ -500,25 +506,25 @@ def adam_sgd(parameters, lr, momentum, Args: parameters (list of parameters): list of network parameters to tune. These can be obtained by the root operator's ``parameters``. - lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a `float` or a `list`, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. - momentum (`float`, `list` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument - value is a `float` or a `list`, momentum is converted to a per-sample schedule by + momentum (float, list or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): momentum schedule. When the argument + value is a float or a list, momentum is converted to a per-sample schedule by invoking :func:`momentum_schedule`. Refer to the `wiki `_. - variance_momentum (`float`, `list` or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): variance momentum schedule. When the argument - value is a `float` or a `list`, variance momentum is converted to a per-sample schedule by + variance_momentum (float, list or output of :func:`momentum_schedule` or :func:`momentum_as_time_constant_schedule`): variance momentum schedule. When the argument + value is a float or a list, variance momentum is converted to a per-sample schedule by invoking :func:`momentum_schedule`. Defaults to momentum_as_time_constant_schedule(720000).
- l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping + gradient_clipping_with_truncation (bool, default `True`): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer` @@ -559,24 +565,24 @@ def rmsprop(parameters, lr, Args: parameters (list of parameters): list of network parameters to tune. These can be obtained by the root operator's ``parameters``. - lr (``float``, `list` or output of :func:`learning_rate_schedule`): learning rate - schedule. When the argument value is a `float` or a `list`, lr is + lr (float, list or output of :func:`learning_rate_schedule`): learning rate + schedule. When the argument value is a float or a list, lr is converted to a per-sample schedule by invoking :func:`learning_rate_schedule`. - gamma (``float``): - inc (``float``): - dec (``float``): - max (``float``): - min (``float``): - need_ave_multiplier (``bool``, default): - l1_regularization_weight (``float``, optional): the L1 regularization weight per sample, + gamma (float): + inc (float): + dec (float): + max (float): + min (float): + need_ave_multiplier (bool, default): + l1_regularization_weight (float, optional): the L1 regularization weight per sample, defaults to 0.0 - l2_regularization_weight (``float``, optional): the L2 regularization weight per sample, + l2_regularization_weight (float, optional): the L2 regularization weight per sample, defaults to 0.0 - gaussian_noise_injection_std_dev (``float``, optional): the standard deviation + gaussian_noise_injection_std_dev (float, optional): the standard deviation of the Gaussian noise added to parameters post update, defaults to 0.0 - gradient_clipping_threshold_per_sample (``float``, optional): clipping threshold + gradient_clipping_threshold_per_sample (float, optional): clipping threshold per sample, defaults to infinity - gradient_clipping_with_truncation (``bool``, default `True`): gradient clipping + gradient_clipping_with_truncation (bool, default `True`): gradient clipping Returns: Instance of a :class:`~cntk.learner.Learner` that can be passed to the :class:`~cntk.trainer.Trainer` diff --git a/bindings/python/cntk/ops/__init__.py b/bindings/python/cntk/ops/__init__.py index 626ab819f..cef93b1e6 100644 --- a/bindings/python/cntk/ops/__init__.py +++ b/bindings/python/cntk/ops/__init__.py @@ -20,11 +20,11 @@ def combine(operands, name=''): with 2 outputs; viz. CrossEntropy loss and ClassificationError output. 
Args: - operands (`list`): list of functions or their variables to combine - name (`str`, optional): the name of the Combine Function in the network + operands (list): list of functions or their variables to combine + name (str, optional): the name of the Combine Function in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import combine converted_operands = list() @@ -49,10 +49,10 @@ def alias(x, name=''): Args: operand: The Function/Variable to alias - name (`str`, optional): the name of the Alias Function in the network + name (str, optional): the name of the Alias Function in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import alias x = sanitize_input(x) @@ -73,9 +73,9 @@ def binary_cross_entropy(output, target, name=''): Args: output: the computed posterior probability from the network target: ground-truth label, 0 or 1 - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import binary_cross_entropy dtype = get_data_type(output, target) @@ -95,9 +95,9 @@ def weighted_binary_cross_entropy(output, target, weight, name=''): output: the computed posterior probability from the network target: ground-truth label, 0 or 1 weight: weight of each example - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import weighted_binary_cross_entropy dtype = get_data_type(output, target, weight) @@ -134,11 +134,11 @@ def cross_entropy_with_softmax(output_vector, target_vector, axis=-1, name=''): target_vector: usually it is one-hot vector where the hot bit corresponds to the label index. But it can be any probability distribution over the labels. - axis (`int` or :class:`cntk.axis.Axis`): axis along which the cross + axis (int or :class:`~cntk.axis.Axis`): axis along which the cross entropy will be computed. - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import cross_entropy_with_softmax dtype = get_data_type(output_vector, target_vector) @@ -168,9 +168,9 @@ def squared_error(output, target, name=''): output: the output values from the network target: it is usually a one-hot vector where the hot bit corresponds to the label index - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import squared_error dtype = get_data_type(output, target) @@ -204,11 +204,11 @@ def classification_error(output_vector, target_vector, axis=-1, topN=1, name='') output_vector: the output values from the network target_vector: it is one-hot vector where the hot bit corresponds to the label index. - axis (`int` or :class:`cntk.axis.Axis`): axis along which the + axis (int or :class:`~cntk.axis.Axis`): axis along which the classification error will be computed. 
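[Reviewer note, illustrative only and not part of the patch: the combine pattern described above, one Function exposing both the loss and the metric; z, a hypothetical network output over 10 classes, is assumed.]

    from cntk.ops import input_variable, combine
    from cntk.ops import cross_entropy_with_softmax, classification_error

    label = input_variable(10)
    loss = cross_entropy_with_softmax(z, label)
    error = classification_error(z, label)
    criterion = combine([loss, error])   # one Function with two outputs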
- name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import classification_error dtype = get_data_type(output_vector, target_vector) @@ -260,7 +260,7 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True], convolution_map: convolution filter weights, stored as a tensor of dimensions :math:`[O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n]`, where :math:`[m_1 \\times m_2 \\times \\ldots \\times m_n]` must be the kernel dimensions (spatial extent of the filter). operand: convolution input. A tensor with dimensions :math:`[I \\times M_1 \\times M_2 \\times \\ldots \\times M_n]`. - strides (`tuple`, optional): stride dimensions. If strides[i] > 1 then only pixel positions that are multiples of strides[i] are computed. + strides (tuple, optional): stride dimensions. If strides[i] > 1 then only pixel positions that are multiples of strides[i] are computed. For example, a stride of 2 will lead to a halving of that dimension. The first stride dimension that lines up with the number of input channels can be set to any non-zero value. sharing (bool): sharing flags for each input dimension @@ -276,9 +276,9 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True], operations. Some convolution engines (e.g. cuDNN and GEMM-based engines) can benefit from using workspace as it may improve performance. However, sometimes this may lead to higher memory utilization. Default is 0 which means the same as the input samples. - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import convolution operand = sanitize_input(operand) @@ -301,9 +301,9 @@ def roipooling(conv_feature_map, rois, roi_output_shape, name=''): conv_feature_map: a convolutional feature map as the input volume ([W x H x C x N]). rois: the coordinates of the ROIs per image ([4 x roisPerImage x N]), each ROI is (x, y, w, h) relative to original image size. roi_output_shape: dimensions (width x height) of the ROI pooling output shape - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import roipooling conv_feature_map = sanitize_input(conv_feature_map) @@ -344,9 +344,9 @@ def pooling(operand, pooling_type, pooling_window_shape, strides=(1,), auto_padd auto_padding: automatic padding flags for each input dimension. lower_pad: precise lower padding for each input dimension upper_pad: precise upper padding for each input dimension - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import pooling operand = sanitize_input(operand) @@ -376,17 +376,17 @@ def batch_normalization(operand, scale, bias, running_mean, running_inv_std, spa training as well. You must pass a parameter tensor with initial value 0 and the same dimensions as ``scale`` and ``bias`` running_inv_std: running variance. 
Represented as ``running_mean`` - spatial(`bool`): flag that indicates whether to compute mean/var for each feature in a minibatch + spatial(bool): flag that indicates whether to compute mean/var for each feature in a minibatch independently or, in case of convolutional layers, per future map - normalization_time_constant(`float`, default 5000): time constant for computing running average of + normalization_time_constant(float, default 5000): time constant for computing running average of mean and variance as a low-pass filtered version of the batch statistics. - blend_time_constant(`float`, default 0): constant for smoothing batch estimates with the running + blend_time_constant(float, default 0): constant for smoothing batch estimates with the running statistics epsilon: conditioner constant added to the variance when computing the inverse standard deviation - use_cudnn_engine(`bool`, default True): - name (`str`, optional): the name of the Function instance in the network + use_cudnn_engine(bool, default True): + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import batch_normalization operand = sanitize_input(operand) @@ -414,9 +414,9 @@ def less(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import less dtype = get_data_type(left, right) @@ -440,9 +440,9 @@ def equal(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import equal dtype = get_data_type(left, right) @@ -466,9 +466,9 @@ def greater(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import greater dtype = get_data_type(left, right) @@ -492,9 +492,9 @@ def greater_equal(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import greater_equal dtype = get_data_type(left, right) @@ -518,9 +518,9 @@ def not_equal(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import not_equal dtype = get_data_type(left, right) @@ -544,9 +544,9 @@ def less_equal(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + 
name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import less_equal dtype = get_data_type(left, right) @@ -574,9 +574,9 @@ def plus(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import plus dtype = get_data_type(left, right) @@ -601,9 +601,9 @@ def minus(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import minus @@ -629,9 +629,9 @@ def element_times(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import element_times dtype = get_data_type(left, right) @@ -656,9 +656,9 @@ def element_divide(left, right, name=''): Args: left: left side tensor right: right side tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import element_divide dtype = get_data_type(left, right) @@ -700,14 +700,14 @@ def times(left, right, output_rank=1, infer_input_rank_to_map=-1, name=''): Args: left: left side matrix or tensor right: right side matrix or tensor - output_rank (`int`): in case we have tensors as arguemnts, output_rank represents + output_rank (int): in case we have tensors as arguments, output_rank represents the number of axes to be collapsed in order to transform the tensors into matrices, perform the operation and then reshape back (explode the axes) infer_input_rank_to_map ('int'): meant for internal use only.
Always use default value - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import times dtype = get_data_type(left, right) @@ -801,10 +801,10 @@ def times_transpose(left, right, name=''): Args: left: left side tensor right: right side matrix or vector - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import times_transpose dtype = get_data_type(left, right) @@ -841,9 +841,9 @@ def floor(arg, name=''): Args: arg: input tensor - name (`str`, optional): the name of the Function instance in the network (optional) + name (str, optional): the name of the Function instance in the network (optional) Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import floor arg = sanitize_input(arg, get_data_type(arg)) @@ -866,9 +866,9 @@ def ceil(arg, name=''): Args: arg: input tensor - name (`str`, optional): the name of the Function instance in the network (optional) + name (str, optional): the name of the Function instance in the network (optional) Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import ceil arg = sanitize_input(arg, get_data_type(arg)) @@ -901,9 +901,9 @@ def round(arg, name=''): Args: arg: input tensor - name (`str`, optional): the name of the Function instance in the network (optional) + name (str, optional): the name of the Function instance in the network (optional) Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import round arg = sanitize_input(arg, get_data_type(arg)) @@ -931,13 +931,13 @@ def clip(x, min_value, max_value, name=''): Args: x: tensor to be clipped - min_value (`float`): a scalar or a tensor which represents the minimum value to clip element + min_value (float): a scalar or a tensor which represents the minimum value to clip element values to - max_value (`float`): a scalar or a tensor which represents the maximum value to clip element + max_value (float): a scalar or a tensor which represents the maximum value to clip element values to - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import clip x = sanitize_input(x, get_data_type(x)) @@ -959,10 +959,10 @@ def relu(x, name=''): array([[ 0., 0., 0., 1., 2.]], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import re_lu x = sanitize_input(x) @@ -983,10 +983,10 @@ def sigmoid(x, name=''): array([ 0.119203, 0.268941, 0.5 , 0.731059, 0.880797], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a 
tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import sigmoid x = sanitize_input(x) @@ -1006,10 +1006,10 @@ def tanh(x, name=''): [ 0.995055, 0.999329]], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import tanh x = sanitize_input(x) @@ -1038,10 +1038,10 @@ def softmax(x, name=''): array([ 0.5, 0.5], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import softmax x = sanitize_input(x) @@ -1062,10 +1062,10 @@ def hardmax(x, name=''): array([ 0., 1., 0., 0.], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`): the name of the node in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str): the name of the node in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import hardmax x = sanitize_input(x) @@ -1084,10 +1084,10 @@ def exp(x, name=''): array([ 1. , 2.718282], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import exp x = sanitize_input(x) @@ -1104,10 +1104,10 @@ def log(x, name=''): array([ 0. , 0.693147], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` Note: CNTK returns -85.1 for log(x) if ``x`` is negative or zero. 
The reason is that @@ -1132,10 +1132,10 @@ def sqrt(x, name=''): array([ 0., 2.], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` Note: CNTK returns zero for sqrt of negative nubmers, this will be changed to @@ -1156,10 +1156,10 @@ def square(x, name=''): array([ 1., 100.], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import square x = sanitize_input(x) @@ -1178,10 +1178,10 @@ def abs(x, name=''): array([ 1., 1., 2., 3.], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import abs x = sanitize_input(x) @@ -1200,10 +1200,10 @@ def negate(x, name=''): array([ 1., -1., 2., -3.], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import negate x = sanitize_input(x) @@ -1220,10 +1220,10 @@ def reciprocal(x, name=''): array([-3. , 5. , -0.5 , 0.333333], dtype=float32) Args: - x: numpy array or any :class:`cntk.ops.functions.Function` that outputs a tensor - name (`str`, optional): the name of the Function instance in the network + x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import reciprocal x = sanitize_input(x) @@ -1245,9 +1245,9 @@ def element_select(flag, value_if_true, value_if_false, name=''): flag: condition tensor value_if_true: true branch tensor value_if_false: false branch tensor - name (`str`, optional): the name of the Function instance in the network + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import element_select flag = sanitize_input(flag) @@ -1295,10 +1295,10 @@ def future_value(x, initial_state=None, time_step=1, name=''): x: the tensor (or its name) from which the future value is obtained. 
initial_state: tensor or scalar representing the initial value to be used when the input tensor is shifted in time. - time_step (`int`): the number of time steps to look into the future (default 1) - name (`str`, optional): the name of the Function instance in the network + time_step (int): the number of time steps to look into the future (default 1) + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from ..utils import sanitize_dtype_cntk @@ -1347,11 +1347,11 @@ def past_value(x, initial_state=None, time_step=1, name=''): x: the tensor (or its name) from which the past value is obtained initial_state: tensor or scalar representing the initial value to be used when the input tensor is shifted in time. - time_step (`int`): the number of time steps to look into the past (default 1) - name (`str`, optional): the name of the Function instance in the network + time_step (int): the number of time steps to look into the past (default 1) + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from ..utils import sanitize_dtype_cntk @@ -1387,10 +1387,10 @@ def reshape(x, shape, name=''): Args: x: tensor to be reshaped - shape (`tuple`): a tuple defining the resulting shape - name (`str`, optional): the name of the Function instance in the network + shape (tuple): a tuple defining the resulting shape + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' if np.any(np.asarray(shape) < 0): # TODO decide on whether -1 instead of 0 should be used to infer the @@ -1417,11 +1417,11 @@ def transpose(x, axis1=0, axis2=1, name=''): Args: x: tensor to be transposed - axis1 (`int` or :class:`cntk.axis.Axis`): the axis to swap with ``axis2`` - axis2 (`int` or :class:`cntk.axis.Axis`): the axis to swap with ``axis1`` - name (`str`, optional): the name of the Function instance in the network + axis1 (int or :class:`~cntk.axis.Axis`): the axis to swap with ``axis2`` + axis2 (int or :class:`~cntk.axis.Axis`): the axis to swap with ``axis1`` + name (str, optional): the name of the Function instance in the network Returns: - :class:`cntk.ops.functions.Function` + :class:`~cntk.ops.functions.Function` ''' from cntk.cntk_py import transpose_axes x = sanitize_input(x) @@ -1471,17 +1471,17 @@ def slice(x, axis, begin_index, end_index, name=''): Args: x: input tensor - axis (`int` or :class:`cntk.axis.Axis`): axis along which ``begin_index`` and ``end_index`` - will be used. If it is of type `int` it will be used as a static axis. - begin_index (`int`): the index along axis where the slicing starts - end_index (`int`): the index along axis where the slicing ends - name (`str`, optional): the name of the Function instance in the network + axis (int or :class:`~cntk.axis.Axis`): axis along which ``begin_index`` and ``end_index`` + will be used. If it is of type int it will be used as a static axis. 
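[Reviewer note, illustrative only and not part of the patch: past_value, documented above, is the building block for recurrences; x is a hypothetical one-dimensional sequence input.]

    from cntk.ops import input_variable, past_value, plus

    x = input_variable(1)
    # x shifted by one step along the dynamic (time) axis; at the first step
    # the default initial_state of 0 is read instead.
    prev = past_value(x, time_step=1)
    pair_sum = plus(x, prev)   # computes x[t] + x[t-1]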
@@ -1471,17 +1471,17 @@ def slice(x, axis, begin_index, end_index, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which ``begin_index`` and ``end_index``
-         will be used. If it is of type `int` it will be used as a static axis.
-        begin_index (`int`): the index along axis where the slicing starts
-        end_index (`int`): the index along axis where the slicing ends
-        name (`str`, optional): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which ``begin_index`` and ``end_index``
+         will be used. If it is of type int it will be used as a static axis.
+        begin_index (int): the index along axis where the slicing starts
+        end_index (int): the index along axis where the slicing ends
+        name (str, optional): the name of the Function instance in the network

    See also:
        Indexing in NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import slice
    x = sanitize_input(x)
@@ -1516,13 +1516,13 @@ def splice(inputs, axis=-1, name=''):
                [ 50., 60.]]], dtype=float32)

    Args:
-        inputs (`list`): tuple of input tensors
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the
+        inputs (list or tuple): the input tensors to concatenate
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the
            concatenation will be performed
-        name (`str`, optional): the name of the Function instance in the network
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import splice
    if type(inputs) not in (list, tuple):
@@ -1576,11 +1576,11 @@ def reduce_sum(x, axis=None, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the reduction will be performed
-        name (`str`, optional): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the reduction will be performed
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import reduce_sum
    x = sanitize_input(x)
@@ -1605,11 +1605,11 @@ def reduce_log_sum(x, axis=None, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the reduction will be performed
-        name (`str`): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the reduction will be performed
+        name (str): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import reduce_log_sum
    x = sanitize_input(x)
@@ -1636,11 +1636,11 @@ def reduce_mean(x, axis=None, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the reduction will be performed
-        name (`str`, optional): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the reduction will be performed
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import reduce_mean
    x = sanitize_input(x)
@@ -1667,11 +1667,11 @@ def reduce_max(x, axis=None, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the reduction will be performed
-        name (`str`): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the reduction will be performed
+        name (str): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import reduce_max
    x = sanitize_input(x)
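A minimal sketch of the reduction family, with the axis given as an int as described in the Args above:

    >>> import numpy as np
    >>> from cntk.ops import reduce_sum, reduce_max
    >>> data = np.array([[1., 2.], [3., 4.]], dtype=np.float32)
    >>> s = reduce_sum(data, axis=0)   # reduces over the first static axis
    >>> m = reduce_max(data, axis=1)   # reduces over the second static axis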
@@ -1698,11 +1698,11 @@ def reduce_min(x, axis=None, name=''):
    Args:
        x: input tensor
-        axis (`int` or :class:`cntk.axis.Axis`): axis along which the reduction will be performed
-        name (`str`): the name of the Function instance in the network
+        axis (int or :class:`~cntk.axis.Axis`): axis along which the reduction will be performed
+        name (str): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import reduce_min
    x = sanitize_input(x)
@@ -1732,12 +1732,12 @@ def random_sample(weights, num_samples, allow_duplicates, name=''):
    Args:
        weights: input vector of sampling weights which should be
            non-negative numbers.
-        num_samples (`int`): number of expected samples
-        allow_duplicates (`bool`): If sampling is done
+        num_samples (int): number of expected samples
+        allow_duplicates (bool): whether sampling is done
            with replacement (`True`) or without (`False`).

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import random_sample
@@ -1765,10 +1765,10 @@ def random_sample_inclusion_frequency(
    Args:
        weights: input vector of sampling weights which should be
-         non-negative numbers.
-        num_samples (`int`): number of expected samples
-        allow_duplicates (`bool`): If sampling is done
-         with replacement (`True`) or without (`False`).
+            non-negative numbers.
+        num_samples (int): number of expected samples
+        allow_duplicates (bool): whether sampling is done
+            with replacement (`True`) or without (`False`).

    Examples:
        >>> import numpy as np
@@ -1792,7 +1792,7 @@ def random_sample_inclusion_frequency(
        1.0

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import random_sample_inclusion_frequency
    weights = sanitize_input(weights)
@@ -1830,11 +1830,11 @@ def dropout(x, dropout_rate=0.0, name=''):
    Args:
        x: input tensor
-        dropout_rate (`float`, [0,1)): probability that an element of ``x`` will be set to zero
-        name (:class:`str`, optional): the name of the Function instance in the network
+        dropout_rate (float, [0,1)): probability that an element of ``x`` will be set to zero
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    if dropout_rate < 0.0 or dropout_rate >= 1.0:
        raise ValueError('dropout_rate must be in the interval [0,1)')
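A minimal sketch of ``dropout`` and its rate check; ``input_variable`` is the op documented in the next hunk:

    >>> from cntk.ops import dropout, input_variable
    >>> x = input_variable(10)
    >>> d = dropout(x, dropout_rate=0.5)  # each element of x is zeroed with probability 0.5
    >>> # a rate outside [0,1) raises the ValueError guarded above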
@@ -1862,15 +1862,15 @@ def input_variable(shape, dtype=np.float32, needs_gradient=True, is_sparse=False
    It creates an input node.

    Args:
-        shape (`tuple` or `int`): the shape of the input tensor
-        dtype (`type`, optional): np.float32 (default) or np.float64
-        needs_gradients (`bool`, optional): whether to back-propagates to it or not. True by default.
-        is_sparse (`bool`, optional): whether the variable is sparse (`False` by default)
-        dynamic_axes (`list` or `tuple`, default): a list of dynamic axis (e.g., batch axis, time axis)
-        name (`str`, optional): the name of the Function instance in the network
+        shape (tuple or int): the shape of the input tensor
+        dtype (type, optional): np.float32 (default) or np.float64
+        needs_gradient (bool, optional): whether to back-propagate to it or not. True by default.
+        is_sparse (bool, optional): whether the variable is sparse (`False` by default)
+        dynamic_axes (list or tuple, optional): a list of dynamic axes (e.g., batch axis, time axis)
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.variables.Variable`
+        :class:`~cntk.ops.variables.Variable`
    '''
    from cntk.cntk_py import input_variable
    from ..utils import sanitize_shape, sanitize_dtype_cntk
@@ -1895,11 +1895,11 @@ def placeholder_variable(shape=None, dynamic_axes=None, name=''):
    are unfolded, the placeholder will get assigned a variable along the
    corresponding dynamic axis.

    Args:
-        shape (`tuple` or `int`): the shape of the variable tensor
-        dynamic_axes (`list`): the list of dynamic axes that the actual variable uses
+        shape (tuple or int): the shape of the variable tensor
+        dynamic_axes (list): the list of dynamic axes that the actual variable uses

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import placeholder_variable, NDShape, Axis
@@ -1928,7 +1928,7 @@ def parameter(shape=None, init=None, device=None, name=''):
        [ 2., 2., 2., 2.]], dtype=float32)

    Args:
-        shape (`tuple` or `int`, optional): the shape of the input tensor. If not provided, it
+        shape (tuple or int, optional): the shape of the input tensor. If not provided, it
            will be inferred from ``value``.
        init (scalar or NumPy array or initializer): if init is a scalar
            it will be replicated for every element in the tensor or
@@ -1936,11 +1936,11 @@ def parameter(shape=None, init=None, device=None, name=''):
            :mod:`cntk.initializer` it will be used to initialize the
            tensor at the first forward pass. If `None`, the tensor will be
            initialized with 0.
-        device (:class:`cntk.device.DeviceDescriptor`): instance of DeviceDescriptor
-        name (`str`, optional): the name of the Parameter instance in the network
+        device (:class:`~cntk.device.DeviceDescriptor`): instance of DeviceDescriptor
+        name (str, optional): the name of the Parameter instance in the network

    Returns:
-        :class:`cntk.ops.variables.Parameter`
+        :class:`~cntk.ops.variables.Parameter`
    '''
    from .variables import Parameter
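A minimal sketch tying ``input_variable`` and ``parameter`` together; a scalar ``init`` is replicated across the shape, matching the doctest output above:

    >>> import numpy as np
    >>> from cntk.ops import input_variable, parameter
    >>> x = input_variable((2,), dtype=np.float32)  # dense input; gradients enabled by default
    >>> w = parameter(shape=(2,), init=2)           # every element initialized to 2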
@@ -1975,12 +1975,12 @@ def constant(value=None, shape=None, device=None, name=''):
        value (scalar or NumPy array, optional): a scalar initial value that would be
            replicated for every element in the tensor or NumPy array.
            If ``None``, the tensor will be initialized uniformly at random.
-        shape (`tuple` or `int`, optional): the shape of the input tensor. If not provided, it will
+        shape (tuple or int, optional): the shape of the input tensor. If not provided, it will
            be inferred from ``value``.
-        device (:class:`cntk.device.DeviceDescriptor`): instance of DeviceDescriptor
-        name (`str`, optional): the name of the Function instance in the network
+        device (:class:`~cntk.device.DeviceDescriptor`): instance of DeviceDescriptor
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.variables.Constant`
+        :class:`~cntk.ops.variables.Constant`
    '''
    from .variables import Constant
    if not device:
@@ -2011,9 +2011,9 @@ def per_dim_mean_variance_normalize(operand, mean, inv_stddev, name=''):
        operand: the variable to be normalized
        mean (NumPy array): per dimension mean to use for the normalization
        inv_stddev (NumPy array): per dimension standard deviation to use for the normalization
-        name (`str`, optional): the name of the Function instance in the network
+        name (str, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import per_dim_mean_variance_normalize
    mean = sanitize_input(mean, get_data_type(mean))
diff --git a/bindings/python/cntk/ops/functions.py b/bindings/python/cntk/ops/functions.py
index c0690b7e6..e610485cf 100644
--- a/bindings/python/cntk/ops/functions.py
+++ b/bindings/python/cntk/ops/functions.py
@@ -5,7 +5,7 @@ from enum import Enum, unique
@unique
class CloneMethod(Enum):
    '''
-    Describes different ways how :class:`~cntk.ops.functions.Function.forward`
+    Describes the different ways in which :func:`~cntk.ops.functions.Function.clone`
    works.
    '''
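A minimal sketch of the three clone methods named in the hunk below, assuming ``z`` is some previously built root Function (hypothetical here, hence the skips):

    >>> z_copy   = z.clone('clone')    # doctest: +SKIP
    >>> z_shared = z.clone('share')    # doctest: +SKIP
    >>> z_frozen = z.clone('freeze')   # doctest: +SKIP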
@@ -125,11 +125,11 @@ class Function(cntk_py.Function):
    Args:
        method (:class:`CloneMethod`): one of
-         * 'cloned': the returned function gets its own copy of parameters (default)
-         * 'shared': the returned function shares its parameters with this function
-         * 'constant': parameters are cloned and made immutable (constant).
+         * 'clone': the returned function gets its own copy of parameters (default)
+         * 'share': the returned function shares its parameters with this function
+         * 'freeze': parameters are cloned and made immutable (constant).

-        substitutions (`dict`): a dictionary mapping variables in this
+        substitutions (dict): a dictionary mapping variables in this
            function to variables in the cloned function

    Returns:
@@ -157,9 +157,9 @@ class Function(cntk_py.Function):
        arguments: maps variables to their input data. The
            interpretation depends on the input type:
-             * `dict`: keys are input variable or names, and values are the input data.
+             * dict: keys are input variable or names, and values are the input data.
             * any other type: if node has a unique input, ``arguments`` is mapped to this input.
-              For nodes with more than one input, only `dict` is allowed.
+              For nodes with more than one input, only dict is allowed.
            In both cases, every sample in the data will be interpreted
            as a new sequence. To mark samples as continuations of the
            previous sequence, specify ``arguments`` as `tuple`: the
@@ -208,9 +208,9 @@ class Function(cntk_py.Function):
        arguments: maps variables to their input data. The
            interpretation depends on the input type:
-             * `dict`: keys are input variable or names, and values are the input data.
+             * dict: keys are input variable or names, and values are the input data.
             * any other type: if node has a unique input, ``arguments`` is mapped to this input.
-              For nodes with more than one input, only `dict` is allowed.
+              For nodes with more than one input, only dict is allowed.
            In both cases, every sample in the data will be interpreted as a new sequence.
            To mark samples as continuations of the previous sequence, specify ``arguments`` as ``tuple``: the
@@ -220,7 +220,7 @@ class Function(cntk_py.Function):
            Data should be either NumPy arrays or a :class:`~cntk.io.MinibatchData` instance.
        outputs (iterable): outputs to fetch values for.
-        keep_for_backward (`set`, default `None`): the subset of the
+        keep_for_backward (set, default `None`): the subset of the
            Function's output variables for which gradients shall be calculated
            in a subsequent backward call. If `None`, the returned state will
            be `None` and a subsequent call to :func:`backward` will not be
@@ -230,7 +230,7 @@ class Function(cntk_py.Function):
            computation is. If `None`, the default device is used.

    Returns:
-        A tuple (`BackpropState`, `map` of outputs to NumPy arrays). The
+        A tuple (BackpropState, map of outputs to NumPy arrays). The
            BackpropState is a handle taken by :func:`backward`.
    '''
    if device is None:
@@ -270,15 +270,15 @@ class Function(cntk_py.Function):
        array([[[ 0.25]]], dtype=float32)

    Args:
-        state (`BackPropState`): state obtained from a previous call to the
+        state (BackPropState): state obtained from a previous call to the
            :func:`cntk.ops.Function.forward` method on this Function for the
            computation that this gradient backpropagation corresponds to.
-        root_gradients (`dict`): the gradients that will be backpropagated
-        variables (`set`): a list of input variables with respect to which
+        root_gradients (dict): the gradients that will be backpropagated
+        variables (set): a list of input variables with respect to which
            the gradients have to be computed.

    Returns:
-        `dict`: mapping of ``variables`` to NumPy arrays
+        dict: mapping of ``variables`` to NumPy arrays
    '''
    root_gradients = sanitize_var_map(self.outputs, root_gradients)
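A minimal sketch of the forward/backward pairing; the gradient 0.25 is d/dx sqrt(x) at x=4, matching the doctest above. The property-style ``output`` accessor and the exact input layout are assumptions, hence the skips:

    >>> import numpy as np
    >>> from cntk.ops import input_variable, sqrt
    >>> x = input_variable(1, needs_gradient=True)
    >>> y = sqrt(x)
    >>> a = np.asarray([[[4.]]], dtype=np.float32)
    >>> state, out = y.forward({x: a}, [y.output], set([y.output]))           # doctest: +SKIP
    >>> y.backward(state, {y.output: np.ones_like(out[y.output])}, set([x]))  # doctest: +SKIP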
@@ -353,7 +353,7 @@ class Function(cntk_py.Function):
    specified replacements in the map.

    Args:
-        substitutions (``dict``): map from placeholder to variables
+        substitutions (dict): map from placeholder to variables

    Returns:
        :class:`Function`: itself
@@ -383,8 +383,8 @@ class Function(cntk_py.Function):
    Save this function graph into a model file

    Args:
-        filename (`str`): model path
-        use_legacy_format (`str`): if 'True', model is stored using legacy format.
+        filename (str): model path
+        use_legacy_format (bool): if True, the model is stored using the legacy format.
            Otherwise, it's stored using protobuf-based protocol serialization.
    '''
    return super(Function, self).save_model(filename, use_legacy_format)
@@ -395,7 +395,7 @@ class Function(cntk_py.Function):
    Restore the model's parameters from a saved model file

    Args:
-        filename (`str`): saved model path
+        filename (str): saved model path

    Returns:
        `None`: this method only has the side-effect of loading the model parameters from the file
diff --git a/bindings/python/cntk/ops/sequence/__init__.py b/bindings/python/cntk/ops/sequence/__init__.py
index a15732775..613c0fd3d 100644
--- a/bindings/python/cntk/ops/sequence/__init__.py
+++ b/bindings/python/cntk/ops/sequence/__init__.py
@@ -30,7 +30,7 @@ def is_first(seq, name=''):
        name (str): the name of the node in the network

    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import is_first
    seq = sanitize_input(seq, get_data_type(seq))
@@ -57,7 +57,7 @@ def is_last(seq, name=''):
        name (str): the name of the node in the network

    Returns:
-        :class:`cntk.Function`:
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import is_last
    seq = sanitize_input(seq, get_data_type(seq))
@@ -80,7 +80,7 @@ def slice(seq, begin_index, end_index, name=''):
        Indexing in NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import sequence_slice
    seq = sanitize_input(seq, get_data_type(seq))
@@ -106,7 +106,7 @@ def first(seq, name=''):
        seq: the symbolic tensor denoting a sequence
        name (str): the name of the node in the network
    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import first
    seq = sanitize_input(seq, get_data_type(seq))
@@ -134,7 +134,7 @@ def last(seq, name=''):
        name (str): the name of the node in the network

    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import last
    seq = sanitize_input(seq, get_data_type(seq))
@@ -164,7 +164,7 @@ def where(condition, name=''):
        name (str): the name of the node in the network

    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import where
    condition = sanitize_input(condition, get_data_type(condition))
@@ -199,7 +199,7 @@ def gather(seq, condition, name=''):
        elements should be selected
        name (str): the name of the node in the network
    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import gather
    seq = sanitize_input(seq, get_data_type(seq))
@@ -249,7 +249,7 @@ def scatter(seq, condition, name=''):
        elements should be copied
        name (str): the name of the node in the network
    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import scatter
    seq = sanitize_input(seq, get_data_type(seq))
@@ -296,7 +296,7 @@ def broadcast_as(operand, broadcast_as_operand, name=''):
        name (str): the name of the node in the network

    Returns:
-        :class:`cntk.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import broadcast_as
    operand = sanitize_input(operand, get_data_type(operand))
@@ -316,7 +316,7 @@ def reduce_sum(seq, name=''):
        name (`str`, optional): the name of the Function instance in the network

    Returns:
-        :class:`cntk.ops.functions.Function`
+        :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import sequence_reduce_sum
    seq = sanitize_input(seq, get_data_type(seq))
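A minimal sketch of the sequence ops amended above, assuming ``input_variable`` attaches the default dynamic (batch and sequence) axes:

    >>> from cntk.ops import input_variable
    >>> from cntk.ops.sequence import first, last, is_first
    >>> seq = input_variable((2,))
    >>> head = first(seq)     # first element of each sequence
    >>> tail = last(seq)      # last element of each sequence
    >>> mask = is_first(seq)  # 1 at the first step of each sequence, 0 elsewhere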
diff --git a/bindings/python/cntk/persist.py b/bindings/python/cntk/persist.py
index afcbc16f1..c54aadf2e 100644
--- a/bindings/python/cntk/persist.py
+++ b/bindings/python/cntk/persist.py
@@ -13,9 +13,9 @@ def save_model(root_op, filename, use_legacy_format=True):
    Save the network of ``root_op`` in ``filename``.

    Args:
-        root_op (:class:`cntk.functions.Function`): op of the graph to save
-        filename (`str`): filename to store the model in
-        use_legacy_format (`str`): if 'True', model is stored using legacy format.
+        root_op (:class:`~cntk.functions.Function`): op of the graph to save
+        filename (str): filename to store the model in
+        use_legacy_format (bool): if True, the model is stored using the legacy format.
            Otherwise, it's stored using protobuf-based protocol serialization.
    '''
    root_op.save_model(filename, use_legacy_format)
@@ -27,10 +27,10 @@ def load_model(filename, dtype=np.float32, device=None):
    :func:`save_model`.

    Args:
-        filename (`str`): filename to load the model from
-        dtype ('float', 'double', or NumPy type, default ``np.float32``): data
+        filename (str): filename to load the model from
+        dtype ('float', 'double', or NumPy type, default np.float32): data
            type of the operation
-        device (:class:`cntk.DeviceDescriptor`, default is the default device):
+        device (:class:`~cntk.DeviceDescriptor`, default is the default device):
            instance of DeviceDescriptor

    Returns:
diff --git a/bindings/python/cntk/trainer.py b/bindings/python/cntk/trainer.py
index 7c85122e9..6cba52a17 100644
--- a/bindings/python/cntk/trainer.py
+++ b/bindings/python/cntk/trainer.py
@@ -26,7 +26,7 @@ class Trainer(cntk_py.Trainer):
        model (:class:`~cntk.ops.functions.Function`): root node of the function to train
        loss_function (:class:`~cntk.ops.functions.Function`): loss function
        eval_function (:class:`~cntk.ops.functions.Function`): evaluation function
-        parameter_learners (`list`): list of learners from :mod:`cntk.learner`
+        parameter_learners (list): list of learners from :mod:`cntk.learner`
        distributed_trainer (:class:`~cntk.distributed.distributed_trainer`): distributed trainer
    '''
    def __init__(self, model, loss_function, eval_function, parameter_learners, distributed_trainer=None):
@@ -133,8 +133,8 @@ class Trainer(cntk_py.Trainer):
    specified file location.

    Args:
-        filename (`str`): filename to store the checkpoint
-        use_legacy_format (`str`): if 'True', model is stored using legacy format.
+        filename (str): filename to store the checkpoint
+        use_legacy_format (bool): if True, the model is stored using the legacy format.
            Otherwise, it's stored using protobuf-based protocol serialization.
    '''
@@ -146,7 +146,7 @@ class Trainer(cntk_py.Trainer):
    specified file location.

    Args:
-        filename (`str`): filename to restore the checkpoint from
+        filename (str): filename to restore the checkpoint from
    '''
    super(Trainer, self).restore_from_checkpoint(filename)
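A minimal sketch of the persistence round trips amended above, assuming ``z`` is a root Function and ``trainer`` an already-constructed Trainer (both hypothetical here, hence the skips):

    >>> from cntk.persist import save_model, load_model
    >>> save_model(z, 'model.dnn')                      # doctest: +SKIP
    >>> z2 = load_model('model.dnn')                    # doctest: +SKIP
    >>> trainer.save_checkpoint('model.chkpt')          # doctest: +SKIP
    >>> trainer.restore_from_checkpoint('model.chkpt')  # doctest: +SKIP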