CNTK v2 library: Migrate past_value and future_value to sequence
Parent: 0d2879eab5
Commit: 8a9020f78e
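The change is mechanical throughout: the free functions past_value() and future_value() move into the sequence namespace. A minimal before/after sketch of user code (illustrative, not a file in this commit):

    import numpy as np
    import cntk as C

    x = C.sequence.input(shape=(1,))
    x0 = [np.array([[1.0], [2.0], [3.0]], dtype=np.float32)]

    # before this commit (now deprecated): y = C.past_value(x)
    # after this commit:
    y = C.sequence.past_value(x)    # shift right: [[0.], [1.], [2.]]
    z = C.sequence.future_value(x)  # shift left:  [[2.], [3.], [0.]]
    print(y.eval({x: x0}), z.eval({x: x0}))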
@@ -6,7 +6,7 @@ from cntk import Trainer, Axis, device, combine
 from cntk.layers.blocks import Stabilizer, _initializer_for, _INFERRED, Parameter, Placeholder
 from cntk.layers import Recurrence, Convolution, Dense
 from cntk.ops import input, sequence, reduce_sum, \
-    parameter, times, element_times, past_value, plus, placeholder, reshape, constant, sigmoid, convolution, tanh, times_transpose, greater, element_divide, element_select, exp, future_value, past_value
+    parameter, times, element_times, plus, placeholder, reshape, constant, sigmoid, convolution, tanh, times_transpose, greater, element_divide, element_select, exp
 from cntk.losses import cosine_distance
 from cntk.internal import _as_tuple, sanitize_input
 from cntk.initializer import uniform, glorot_uniform
@@ -10,7 +10,7 @@ import os
 from cntk import Trainer, Axis
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT
 from cntk.learners import momentum_sgd, fsadagrad, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType
-from cntk import input, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, \
+from cntk import input, cross_entropy_with_softmax, classification_error, sequence, \
     element_select, alias, hardmax, placeholder, combine, parameter, times, plus
 from cntk.ops.functions import CloneMethod, load_model, Function
 from cntk.initializer import glorot_uniform
@@ -17,7 +17,7 @@ from cntk.device import try_set_default_device, gpu
 from cntk.train.distributed import *
 from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
 from cntk.learners import learning_rate_schedule, UnitType, momentum_sgd, momentum_as_time_constant_schedule
-from cntk import input, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, alias, hardmax
+from cntk import input, cross_entropy_with_softmax, classification_error, sequence, element_select, alias, hardmax
 from cntk.ops.functions import CloneMethod
 from cntk.train.training_session import *
 from cntk.logging import *
@@ -87,7 +87,7 @@ def create_network(input_vocab_dim, label_vocab_dim):
     encoder_outputH = stabilize(input_sequence)
     for i in range(0, num_layers):
         (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
-            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)
+            encoder_outputH.output, hidden_dim, hidden_dim, sequence.future_value, sequence.future_value)

     thought_vectorH = sequence.first(encoder_outputH)
     thought_vectorC = sequence.first(encoder_outputC)
@@ -100,20 +100,20 @@ def create_network(input_vocab_dim, label_vocab_dim):
     # Decoder
     decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence

-    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(
+    decoder_input = element_select(is_first_label, label_sentence_start_scattered, sequence.past_value(
         decoder_history_hook))

     decoder_outputH = stabilize(decoder_input)
     for i in range(0, num_layers):
         if (i > 0):
-            recurrence_hookH = past_value
-            recurrence_hookC = past_value
+            recurrence_hookH = sequence.past_value
+            recurrence_hookC = sequence.past_value
         else:
             isFirst = sequence.is_first(label_sequence)
             recurrence_hookH = lambda operand: element_select(
-                isFirst, thought_vector_broadcastH, past_value(operand))
+                isFirst, thought_vector_broadcastH, sequence.past_value(operand))
             recurrence_hookC = lambda operand: element_select(
-                isFirst, thought_vector_broadcastC, past_value(operand))
+                isFirst, thought_vector_broadcastC, sequence.past_value(operand))

         (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
             decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)
@@ -168,7 +168,7 @@ def LSTMP_cell_with_self_stabilization(input, prev_output, prev_cell_state):
     return (times(element_times(expsWmr, mt), Wmr), ct)


-def LSTMP_component_with_self_stabilization(input, output_dim, cell_dim, recurrence_hookH=past_value, recurrence_hookC=past_value):
+def LSTMP_component_with_self_stabilization(input, output_dim, cell_dim, recurrence_hookH=sequence.past_value, recurrence_hookC=sequence.past_value):
     dh = placeholder(
         shape=(output_dim), dynamic_axes=input.dynamic_axes)
     dc = placeholder(
@@ -44,7 +44,7 @@ def create_test_model():

 def with_lookahead():
     x = placeholder()
-    future_x = future_value(x)
+    future_x = sequence.future_value(x)
     apply_x = splice (x, future_x)
     return apply_x

@@ -89,10 +89,9 @@ if __name__=='__main__':
         array([[7.2,8.2]]),
         array([[7.3,8.3], [7.31, 8.31]]),
         ]
-    from cntk.ops import past_value, future_value
     data_seq_axis = Axis('inputAxis')
     init_seq_axis = Axis('initAxis')
-    f = past_value(sequence.input(2, sequence_axis=data_seq_axis), time_step=2, initial_state=sequence.input(2, sequence_axis=init_seq_axis))
+    f = sequence.past_value(sequence.input(2, sequence_axis=data_seq_axis), time_step=2, initial_state=sequence.input(2, sequence_axis=init_seq_axis))
     res = f(data, initial_state)
     print(res)

@@ -9,7 +9,7 @@ module_path = os.path.join(py_path, 'ReasoNet')
 import cntk.device as device
 import numpy as np
 from cntk.ops.tests.ops_test_utils import cntk_device
-from cntk.ops import input, past_value, future_value
+from cntk.ops import input
 from cntk.io import MinibatchSource
 from cntk import Trainer, Axis, device, combine
 from cntk.layers import Recurrence, Convolution
@@ -833,7 +833,7 @@
 "\n",
 "* takes no input arguments\n",
 "* creates a placeholder (sequence) variable\n",
-"* computes the \"next value\" in this sequence using the `future_value()` operation and\n",
+"* computes the \"next value\" in this sequence using the `sequence.future_value()` operation and\n",
 "* concatenates the current and the next value into a vector of twice the embedding dimension using `splice()`\n",
 "\n",
 "and then insert this function into `Sequential()`'s list right after the embedding layer."
@@ -1055,7 +1055,7 @@
 "source": [
 "def OneWordLookahead():\n",
 "    x = C.placeholder()\n",
-"    apply_x = splice (x, future_value(x))\n",
+"    apply_x = splice (x, sequence.future_value(x))\n",
 "    return apply_x\n",
 "\n",
 "def create_model():\n",
@@ -238,6 +238,7 @@
 "#from keras.optimizers import *\n",
 "from cntk import *\n",
 "from cntk.layers import *\n",
+"from cntk.ops.sequence import input\n",
 "# Select the right target device when this notebook is being tested:\n",
 "if 'TEST_DEVICE' in os.environ:\n",
 "    if os.environ['TEST_DEVICE'] == 'cpu':\n",
@@ -1014,7 +1015,7 @@
 "D = 4 # input dimensionality\n",
 "H = 10 # number of hidden layer neurons\n",
 "\n",
-"observations = C.input(STATE_COUNT, np.float32, name=\"obs\")\n",
+"observations = input(STATE_COUNT, np.float32, name=\"obs\")\n",
 "\n",
 "W1 = C.parameter(shape=(STATE_COUNT, H), init=C.glorot_uniform(), name=\"W1\")\n",
 "b1 = C.parameter(shape=H, name=\"b1\")\n",
@@ -1106,8 +1107,8 @@
 }
 ],
 "source": [
-"input_y = C.input(1, np.float32, name=\"input_y\")\n",
-"advantages = C.input(1, np.float32, name=\"advt\")\n",
+"input_y = input(1, np.float32, name=\"input_y\")\n",
+"advantages = input(1, np.float32, name=\"advt\")\n",
 "\n",
 "loss = -C.reduce_mean(C.log(C.square(input_y - probability) + 1e-4) * advantages, axis=0, name='loss')\n",
 "\n",
@@ -184,7 +184,7 @@
 "from cntk import Trainer, Axis\n",
 "from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT\n",
 "from cntk.learners import momentum_sgd, fsadagrad, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType\n",
-"from cntk import input, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, \\\n",
+"from cntk import input, cross_entropy_with_softmax, classification_error, sequence, \\\n",
 "    element_select, alias, hardmax, placeholder_variable, combine, parameter, times, plus\n",
 "from cntk.ops.functions import CloneMethod, load_model, Function\n",
 "from cntk.initializer import glorot_uniform\n",
@@ -281,7 +281,7 @@
 "    indices = np.random.choice(\n",
 "        range(Param.num_classes),\n",
 "        size=num_vectors, \n",
-"        p = data_sampling_distribution()).reshape((1, num_vectors))\n",
+"        p = data_sampling_distribution()).reshape((num_vectors, 1))\n",
 "    list_of_vectors = C.Value.one_hot(indices, Param.num_classes)\n",
 "    return (list_of_vectors, indices.flatten())\n",
 "\n",
@@ -376,7 +376,7 @@
 "    vectors, indices = get_random_one_hot_data(Param.test_set_size)\n",
 "    total_cross_entropy = 0.0\n",
 "    arguments = (vectors)\n",
-"    z = softmax_input.eval(arguments)[0].reshape(Param.test_set_size, Param.num_classes)\n",
+"    z = softmax_input.eval(arguments).reshape(Param.test_set_size, Param.num_classes)\n",
 "\n",
 "    for i in range(len(indices)):\n",
 "        log_p = log_softmax(z[i], indices[i])\n",
@@ -168,7 +168,7 @@
 "from cntk import Trainer, Axis\n",
 "from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT\n",
 "from cntk.learners import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType\n",
-"from cntk import input, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, \\\n",
+"from cntk import input, cross_entropy_with_softmax, classification_error, sequence, element_select, \\\n",
 "    alias, hardmax, placeholder, combine, parameter, plus, times\n",
 "from cntk.ops.functions import CloneMethod\n",
 "from cntk.layers import LSTM, Stabilizer\n",
@@ -499,7 +499,7 @@
 },
 "outputs": [],
 "source": [
-"def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):\n",
+"def LSTM_layer(input, output_dim, recurrence_hook_h=sequence.past_value, recurrence_hook_c=sequence.past_value):\n",
 "    # we first create placeholders for the hidden state and cell state which we don't have yet\n",
 "    dh = placeholder(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n",
 "    dc = placeholder(shape=(output_dim), dynamic_axes=input.dynamic_axes)\n",
@@ -531,7 +531,7 @@
 "\n",
 "We will use the LSTM recurrence that we defined just above. Remember that its function signature is:\n",
 "\n",
-"`def LSTM_layer(input, output_dim, recurrence_hook_h=past_value, recurrence_hook_c=past_value):`\n",
+"`def LSTM_layer(input, output_dim, recurrence_hook_h=sequence.past_value, recurrence_hook_c=sequence.past_value):`\n",
 "\n",
 "and it returns a tuple `(hidden_state, hidden_cell)`. We will complete the following four exercises below. If possible, try them out before looking at the answers.\n",
 "\n",
@@ -571,7 +571,7 @@
 "\n",
 "# 4.\n",
 "# Reverse the order of the input_sequence (this has been shown to help especially in machine translation)\n",
-"(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim, future_value, future_value)"
+"(encoder_output_h, encoder_output_c) = LSTM_layer(input_sequence, hidden_dim, sequence.future_value, sequence.future_value)"
 ]
 },
 {
@@ -595,7 +595,7 @@
 },
 "outputs": [],
 "source": [
-"decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(label_sequence))"
+"decoder_input = element_select(is_first_label, label_sentence_start_scattered, sequence.past_value(label_sequence))"
 ]
 },
 {
@@ -618,7 +618,7 @@
 "outputs": [],
 "source": [
 "(output_h, output_c) = LSTM_layer(input_sequence, hidden_dim,\n",
-"                                  recurrence_hook_h=past_value, recurrence_hook_c=past_value)"
+"                                  recurrence_hook_h=sequence.past_value, recurrence_hook_c=sequence.past_value)"
 ]
 },
 {
@@ -647,8 +647,8 @@
 "# 1.\n",
 "# Create the recurrence hooks for the decoder LSTM.\n",
 "\n",
-"recurrence_hook_h = lambda operand: element_select(is_first_label, thought_vector_broadcast_h, past_value(operand))\n",
-"recurrence_hook_c = lambda operand: element_select(is_first_label, thought_vector_broadcast_c, past_value(operand))\n",
+"recurrence_hook_h = lambda operand: element_select(is_first_label, thought_vector_broadcast_h, sequence.past_value(operand))\n",
+"recurrence_hook_c = lambda operand: element_select(is_first_label, thought_vector_broadcast_c, sequence.past_value(operand))\n",
 "\n",
 "# 2.\n",
 "# With your recurrence hooks, create the decoder.\n",
@@ -663,13 +663,13 @@
 "decoder_output_h = alias(decoder_input)\n",
 "for i in range(0, num_layers):\n",
 "    if (i > 0):\n",
-"        recurrence_hook_h = past_value\n",
-"        recurrence_hook_c = past_value\n",
+"        recurrence_hook_h = sequence.past_value\n",
+"        recurrence_hook_c = sequence.past_value\n",
 "    else:\n",
 "        recurrence_hook_h = lambda operand: element_select(\n",
-"            is_first_label, thought_vector_broadcast_h, past_value(operand))\n",
+"            is_first_label, thought_vector_broadcast_h, sequence.past_value(operand))\n",
 "        recurrence_hook_c = lambda operand: element_select(\n",
-"            is_first_label, thought_vector_broadcast_c, past_value(operand))\n",
+"            is_first_label, thought_vector_broadcast_c, sequence.past_value(operand))\n",
 "\n",
 "    (decoder_output_h, decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim,\n",
 "                                                      recurrence_hook_h, recurrence_hook_c)"
@@ -749,7 +749,7 @@
 "    encoder_output_h = stabilize(input_sequence)\n",
 "    for i in range(0, num_layers):\n",
 "        (encoder_output_h, encoder_output_c) = LSTM_layer(\n",
-"            encoder_output_h.output, hidden_dim, future_value, future_value)\n",
+"            encoder_output_h.output, hidden_dim, sequence.future_value, sequence.future_value)\n",
 "\n",
 "    # Prepare encoder output to be used in decoder\n",
 "    thought_vector_h = sequence.first(encoder_output_h)\n",
@@ -763,19 +763,19 @@
 "    # Decoder\n",
 "    decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n",
 "\n",
-"    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(\n",
+"    decoder_input = element_select(is_first_label, label_sentence_start_scattered, sequence.past_value(\n",
 "        decoder_history_hook))\n",
 "\n",
 "    decoder_output_h = stabilize(decoder_input)\n",
 "    for i in range(0, num_layers):\n",
 "        if (i > 0):\n",
-"            recurrence_hook_h = past_value\n",
-"            recurrence_hook_c = past_value\n",
+"            recurrence_hook_h = sequence.past_value\n",
+"            recurrence_hook_c = sequence.past_value\n",
 "        else:\n",
 "            recurrence_hook_h = lambda operand: element_select(\n",
-"                is_first_label, thought_vector_broadcast_h, past_value(operand))\n",
+"                is_first_label, thought_vector_broadcast_h, sequence.past_value(operand))\n",
 "            recurrence_hook_c = lambda operand: element_select(\n",
-"                is_first_label, thought_vector_broadcast_c, past_value(operand))\n",
+"                is_first_label, thought_vector_broadcast_c, sequence.past_value(operand))\n",
 "\n",
 "        (decoder_output_h, decoder_output_c) = LSTM_layer(\n",
 "            decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)\n",
@@ -936,7 +936,7 @@
 "outputs": [],
 "source": [
 "decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence\n",
-"decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_hook))"
+"decoder_input = element_select(is_first_label, label_sentence_start_scattered, sequence.past_value(decoder_history_hook))"
 ]
 },
 {
@@ -493,8 +493,8 @@ def memoize(func):

 @memoize
 def _sparse_to_dense_network_cache(input_shape):
-    from cntk.ops import times, input
+    from cntk.ops import times, sequence

-    temp_input = input(input_shape)
+    temp_input = sequence.input(input_shape)
     eye_shape = input_shape[-1]
     return times(temp_input, np.eye(eye_shape))
@@ -14,7 +14,7 @@ import numpy as np
 from cntk import input, placeholder, combine, alias, sequence, parameter, constant
 from cntk.variables import Record, Constant, Parameter
 from cntk.axis import Axis
-from cntk.ops import times, slice, sigmoid, tanh, log, exp, softplus, past_value, future_value
+from cntk.ops import times, slice, sigmoid, tanh, log, exp, softplus
 from .typing import Signature
 from cntk.internal import _as_tuple
 from cntk.initializer import glorot_uniform
@@ -96,7 +96,7 @@ def ForwardDeclaration(name='forward_declaration'):
     >>> x = C.input(**Sequence[Tensor[2]])
     >>> ones_like_input = sequence.broadcast_as(1, x) # sequence of scalar ones of same length as input
     >>> out_fwd = ForwardDeclaration() # placeholder for the state variables
-    >>> out = past_value(out_fwd, initial_state=0) + ones_like_input
+    >>> out = sequence.past_value(out_fwd, initial_state=0) + ones_like_input
     >>> out_fwd.resolve_to(out)
     >>> length = sequence.last(out)
     >>> x0 = np.reshape(np.arange(6,dtype=np.float32),(1,3,2))
@@ -208,9 +208,9 @@ def _window(x, axis, begin, end, step, stride, initial_state=None):
     helper to expand a sequence into a window, splicing them along the given axis (which must already exist)
     '''
     shifted = [
-        past_value(x, initial_state=initial_state, time_step=-t) if t < 0 else
+        sequence.past_value(x, initial_state=initial_state, time_step=-t) if t < 0 else
         x if t == 0 else
-        future_value(x, initial_state=initial_state, time_step=t)
+        sequence.future_value(x, initial_state=initial_state, time_step=t)
         for t in range(begin, end, step)
     ]
     r = splice(*shifted, axis=axis)
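The comprehension above builds time-shifted copies of the input and splices them along an axis. A rough equivalent in user code for a 3-tap window (a sketch of the idea, not the helper itself; names are illustrative):

    import numpy as np
    import cntk as C

    x = C.sequence.input(shape=(1,))
    window = C.splice(C.sequence.past_value(x),    # t-1, zero-padded at the sequence start
                      x,                           # t
                      C.sequence.future_value(x))  # t+1, zero-padded at the sequence end

    x0 = [np.array([[1.0], [2.0], [3.0]], dtype=np.float32)]
    print(window.eval({x: x0}))  # each step holds [x[t-1], x[t], x[t+1]]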
@@ -7,7 +7,7 @@
 # sequence -- first/higher-order functions over sequences, like Recurrence()

 from ..variables import Record
-from ..ops import combine, past_value, future_value, splice, sequence
+from ..ops import combine, splice, sequence
 from .blocks import *
 from .blocks import _get_initial_state_or_default, _inject_name

@@ -193,7 +193,7 @@ def RecurrenceFrom(step_function, go_backwards=default_override_or(False), retur
     >>> decoder = RecurrenceFrom(LSTM(500)) # decoder starts from a data-dependent initial state, hence -From()
     >>> emit = Dense(30000)
     >>> h, c = encoder(embed(en)).outputs # LSTM encoder has two outputs (h, c)
-    >>> z = emit(decoder(h, c, past_value(fr))) # decoder takes encoder outputs as initial state
+    >>> z = emit(decoder(h, c, sequence.past_value(fr))) # decoder takes encoder outputs as initial state
     >>> loss = C.cross_entropy_with_softmax(z, fr)

     Args:
@@ -570,7 +570,7 @@ def UnfoldFrom(generator_function, until_predicate=None, length_increase=1, name

     # apply until_predicate if given
     if until_predicate is not None:
-        valid_frames = Recurrence(lambda h, x: (1-past_value(x)) * h, initial_state=1, name='valid_frames')(until_predicate(output))
+        valid_frames = Recurrence(lambda h, x: (1-sequence.past_value(x)) * h, initial_state=1, name='valid_frames')(until_predicate(output))
         output = sequence.gather(output, valid_frames, name='valid_output')

     return output
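The valid_frames recurrence computes a keep-until-first-stop-flag mask: h_t = (1 - x_{t-1}) * h_{t-1}, so the mask stays 1 through the step that emits the stop symbol and drops to 0 afterwards. The same recurrence in plain Python (illustrative):

    def valid_frames(stop_flags):
        h, prev, out = 1.0, 0.0, []   # prev models past_value(x) with initial state 0
        for x in stop_flags:
            h = (1.0 - prev) * h      # mirrors lambda h, x: (1 - past_value(x)) * h
            out.append(h)
            prev = x
        return out

    print(valid_frames([0, 0, 1, 0]))  # [1.0, 1.0, 1.0, 0.0]: the stop symbol itself is kept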
@@ -435,7 +435,7 @@ class TensorBoardProgressWriter(cntk_py.ProgressWriter):
         if freq is None:
             freq = sys.maxsize

-        super(TensorBoardProgressWriter, self).__init__(freq, 0, sys.maxsize, 0)
+        super(TensorBoardProgressWriter, self).__init__(freq, 0, sys.maxsize, 0, sys.maxsize, 0)

         # Only log either when rank is not specified or when rank is 0.
         self.writer = cntk_py.TensorBoardFileWriter(log_dir, model) if not rank else None
@@ -1633,6 +1633,8 @@ def element_select(flag, value_if_true, value_if_false, name=''):
 @typemap
 def future_value(x, initial_state=None, time_step=1, name=''):
     '''
+    DEPRECATED.
+
     This function returns the future value w.r.t. ``x``. It is most often used when
     creating RNNs. The resulting tensor has the same shape as the input but is
     the next logical sample. The ``time_step`` parameter is the number of steps
@@ -1674,20 +1676,18 @@ def future_value(x, initial_state=None, time_step=1, name=''):
     :class:`~cntk.ops.functions.Function`
     '''

-    from cntk.internal import sanitize_dtype_cntk
-    from ..cntk_py import Constant
-    from cntk.cntk_py import future_value
+    import warnings
+    warnings.warn('This will be removed in future versions. Please use '
+                  'sequence.future_value() instead.', DeprecationWarning)

-    if initial_state is None:
-        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
-
-    x = sanitize_input(x)
-    return future_value(x, initial_state, time_step, name)
+    return sequence.future_value(x, initial_state, time_step, name)


 @typemap
 def past_value(x, initial_state=None, time_step=1, name=''):
     '''
+    DEPRECATED.
+
     This function returns the past value w.r.t. ``x``. It is most often used when
     creating RNNs. The resulting tensor has the same shape as the input but is
     the previous logical sample. The ``time_step`` parameter is the number of steps
@@ -1774,17 +1774,11 @@ def past_value(x, initial_state=None, time_step=1, name=''):
     :class:`~cntk.ops.functions.Function`
     '''

-    from cntk.internal import sanitize_dtype_cntk
-    from ..cntk_py import Constant
-    from cntk.cntk_py import past_value
+    import warnings
+    warnings.warn('This will be removed in future versions. Please use '
+                  'sequence.past_value() instead.', DeprecationWarning)

-    if initial_state is None:
-        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
-    else:
-        initial_state = sanitize_input(initial_state)
-
-    x = sanitize_input(x)
-    return past_value(x, initial_state, time_step, name)
+    return sequence.past_value(x, initial_state, time_step, name)


 # TODO: does this belong into .sequence?
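The old top-level ops are kept as thin shims that warn and forward. A quick sanity check of the intended behavior (a sketch, assuming a CNTK build that includes this commit):

    import warnings
    import numpy as np
    import cntk as C

    x = C.sequence.input(shape=(1,))
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        y = C.past_value(x)  # deprecated spelling still works...
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    y2 = C.sequence.past_value(x)  # ...but this is the warning-free replacement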
@@ -38,9 +38,166 @@ def input(shape, dtype=default_override_or(np.float32), needs_gradient=False, is
 # sequence ops
 ##########################################################################

+@typemap
+def future_value(x, initial_state=None, time_step=1, name=''):
+    '''
+    This function returns the future value w.r.t. ``x``. It is most often used when
+    creating RNNs. The resulting tensor has the same shape as the input but is
+    the next logical sample. The ``time_step`` parameter is the number of steps
+    to look into the future and is 1 by default. If there is no future value (i.e.
+    the current sample is the last one in the tensor) then the ``initial_state``
+    value is returned.
+
+    The initial state can be a constant (scalar or tensor), a learnable tensor
+    or input data (which has a batch dimension, as needed for sequence-to-sequence models).
+
+    Example:
+        >>> x = C.sequence.input(shape=(3,2))
+        >>> # Create one sequence with 4 tensors of shape (3, 2)
+        >>> x0 = np.reshape(np.arange(24,dtype=np.float32),(1,4,3,2))
+        >>> y = C.sequence.future_value(x) # using initial state of 0 by default
+        >>> y.eval({x:x0})
+        [array([[[  6.,   7.],
+                 [  8.,   9.],
+                 [ 10.,  11.]],
+        <BLANKLINE>
+                [[ 12.,  13.],
+                 [ 14.,  15.],
+                 [ 16.,  17.]],
+        <BLANKLINE>
+                [[ 18.,  19.],
+                 [ 20.,  21.],
+                 [ 22.,  23.]],
+        <BLANKLINE>
+                [[  0.,   0.],
+                 [  0.,   0.],
+                 [  0.,   0.]]], dtype=float32)]
+
+    Args:
+        x: the tensor (or its name) from which the future value is obtained.
+        initial_state: tensor or scalar representing the initial value to be used when the input tensor is shifted in time.
+        time_step (int): the number of time steps to look into the future (default 1)
+        name (str, optional): the name of the Function instance in the network
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+
+    from cntk.internal import sanitize_dtype_cntk
+    from ...cntk_py import Constant
+    from cntk.cntk_py import future_value
+
+    if initial_state is None:
+        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
+
+    x = sanitize_input(x)
+    return future_value(x, initial_state, time_step, name)
+
+
+@typemap
+def past_value(x, initial_state=None, time_step=1, name=''):
+    '''
+    This function returns the past value w.r.t. ``x``. It is most often used when
+    creating RNNs. The resulting tensor has the same shape as the input but is
+    the previous logical sample. The ``time_step`` parameter is the number of steps
+    to look into the past and is 1 by default. If there is no past value (i.e.
+    the current sample is the first one in the tensor) then the ``initial_state``
+    value is returned.
+
+    The initial state can be a constant (scalar or tensor), a learnable tensor
+    or input data (which has a batch dimension, as needed for sequence-to-sequence models).
+
+    Example:
+        >>> # create example input: one sequence with 4 tensors of shape (3, 2)
+        >>> from cntk.layers.typing import Tensor, Sequence
+        >>> x = C.sequence.input((3,2))
+        >>> x0 = np.reshape(np.arange(24,dtype=np.float32),(1,4,3,2))
+        >>> x0
+        array([[[[  0.,   1.],
+                 [  2.,   3.],
+                 [  4.,   5.]],
+        <BLANKLINE>
+                [[  6.,   7.],
+                 [  8.,   9.],
+                 [ 10.,  11.]],
+        <BLANKLINE>
+                [[ 12.,  13.],
+                 [ 14.,  15.],
+                 [ 16.,  17.]],
+        <BLANKLINE>
+                [[ 18.,  19.],
+                 [ 20.,  21.],
+                 [ 22.,  23.]]]], dtype=float32)
+
+        >>> # this demonstrates how past_value shifts the sequence by one, padding with initial_state
+        >>> y = C.sequence.past_value(x) # initial_state is 0 by default
+        >>> y.eval({x:x0})
+        [array([[[  0.,   0.],
+                 [  0.,   0.],
+                 [  0.,   0.]],
+        <BLANKLINE>
+                [[  0.,   1.],
+                 [  2.,   3.],
+                 [  4.,   5.]],
+        <BLANKLINE>
+                [[  6.,   7.],
+                 [  8.,   9.],
+                 [ 10.,  11.]],
+        <BLANKLINE>
+                [[ 12.,  13.],
+                 [ 14.,  15.],
+                 [ 16.,  17.]]], dtype=float32)]
+
+        >>> # here, we pass the initial_state as input data (e.g. sequence-to-sequence)
+        >>> s = C.input((3,2)) # not a sequence, e.g. a final encoder hidden state
+        >>> s0 = np.reshape(np.arange(6,dtype=np.float32)/2,(1,3,2))
+        >>> s0
+        array([[[ 0. ,  0.5],
+                [ 1. ,  1.5],
+                [ 2. ,  2.5]]], dtype=float32)
+        >>> y = C.sequence.past_value(x, initial_state=s)
+        >>> y.eval({x:x0, s:s0}) # same as the previous example except for the first time step
+        [array([[[  0. ,   0.5],
+                 [  1. ,   1.5],
+                 [  2. ,   2.5]],
+        <BLANKLINE>
+                [[  0. ,   1. ],
+                 [  2. ,   3. ],
+                 [  4. ,   5. ]],
+        <BLANKLINE>
+                [[  6. ,   7. ],
+                 [  8. ,   9. ],
+                 [ 10. ,  11. ]],
+        <BLANKLINE>
+                [[ 12. ,  13. ],
+                 [ 14. ,  15. ],
+                 [ 16. ,  17. ]]], dtype=float32)]
+
+    Args:
+        x: the tensor (or its name) from which the past value is obtained
+        initial_state: tensor or scalar representing the initial value to be used when the input tensor is shifted in time.
+        time_step (int): the number of time steps to look into the past (default 1)
+        name (str, optional): the name of the Function instance in the network
+
+    Returns:
+        :class:`~cntk.ops.functions.Function`
+    '''
+
+    from cntk.internal import sanitize_dtype_cntk
+    from ...cntk_py import Constant
+    from cntk.cntk_py import past_value
+
+    if initial_state is None:
+        initial_state = Constant.scalar(sanitize_dtype_cntk(np.float32), 0.0)
+    else:
+        initial_state = sanitize_input(initial_state)
+
+    x = sanitize_input(x)
+    return past_value(x, initial_state, time_step, name)
+
+
 def delay(x, initial_state=None, time_step=1, name=''):
     '''
-    This function combines :func:`~cntk.ops.past_value` and :func:`~cntk.ops.future_value` into a single function.
+    This function combines :func:`~cntk.ops.sequence.past_value` and :func:`~cntk.ops.sequence.future_value` into a single function.
     This is useful when the time_step is computed and can be positive, negative, or 0.

     Args:
@@ -49,7 +206,7 @@ def delay(x, initial_state=None, time_step=1, name=''):
         time_step (int): the number of time steps to look into the past, where negative values mean to look into the future, and 0 means a no-op (default 1).
         name (str, optional): the name of the Function instance in the network
     '''
-    from ...ops import alias, past_value, future_value, element_select, element_divide, placeholder, exp
+    from ...ops import alias, element_select, element_divide, placeholder, exp
     if time_step > 0:
         return past_value (x, time_step= time_step, initial_state=initial_state, name=name)
     elif time_step < 0:
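Since delay() only dispatches on the sign of time_step, all three branches can be seen with (illustrative values):

    import numpy as np
    import cntk as C

    x = C.sequence.input(shape=(1,))
    x0 = [np.array([[1.0], [2.0], [3.0]], dtype=np.float32)]

    print(C.sequence.delay(x, time_step=1).eval({x: x0}))   # past_value branch:   [0, 1, 2]
    print(C.sequence.delay(x, time_step=-1).eval({x: x0}))  # future_value branch: [2, 3, 0]
    print(C.sequence.delay(x, time_step=0).eval({x: x0}))   # alias (no-op):       [1, 2, 3]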
@@ -424,7 +581,7 @@ def reduce_max(x, name=''):
     Returns:
         The max value in the input sequence
     """
-    from ...ops import past_value, future_value, element_select, placeholder, greater
+    from ...ops import element_select, placeholder, greater
     m = placeholder(shape=(1,), dynamic_axes = x.dynamic_axes, name='max')
     o = element_select(greater(x, future_value(m)), x, future_value(m))
     rlt = o.replace_placeholders({m:sanitize_input(o)})
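The placeholder recurrence above propagates the running maximum backwards through the sequence via future_value. Assuming the helper is exposed as C.sequence.reduce_max (an assumption; the public name is not shown in this hunk), usage would look like:

    import numpy as np
    import cntk as C

    x = C.sequence.input(shape=(1,))
    m = C.sequence.reduce_max(x)  # assumed public entry point for the helper above
    x0 = [np.array([[3.0], [7.0], [5.0]], dtype=np.float32)]
    print(m.eval({x: x0}))  # the sequence maximum, 7.0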
@@ -13,7 +13,7 @@ import pytest
 from ..functions import *
 from ...train.trainer import *
 from ...initializer import glorot_uniform
-from .. import constant, parameter, input, placeholder, times, plus, past_value, sequence, as_composite, combine, convolution, splice, as_block
+from .. import constant, parameter, input, placeholder, times, plus, sequence, as_composite, combine, convolution, splice, as_block
 from ... import InferredDimension, gpu, cpu
 from .ops_test_utils import compare_lists_of_np_arrays, AA, cntk_device

@@ -190,7 +190,7 @@ def test_data_type_inference():
 def test_recurrence_shape_inference():
     i = sequence.input((2,))
     p = placeholder()
-    p_past = past_value(p)
+    p_past = sequence.past_value(p)
     p_past_plus_i = p_past + i

     p_past_plus_i.replace_placeholder(p_past_plus_i.output)
@@ -58,8 +58,7 @@ def test_op_future_value(input_size, time_step, initial_state, device_id, precis
     }
     init = parameter(init=AA(initial_state, dtype=dt), device=cntk_device(device_id))

-    from .. import future_value
-    input_op_input = future_value(a, init, time_step)
+    input_op_input = sequence.future_value(a, init, time_step)

     unittest_helper(input_op_input,
                     x, expected_forward, expected_backward,
@@ -95,8 +94,7 @@ def test_op_past_value(input_size, time_step, initial_state, device_id, precisio

     init = parameter(init=AA(initial_state, dtype=dt), device=cntk_device(device_id))

-    from .. import past_value
-    input_op_input = past_value(a, init, time_step)
+    input_op_input = sequence.past_value(a, init, time_step)

     unittest_helper(input_op_input,
                     x, expected_forward, expected_backward,
@@ -513,7 +513,7 @@ def test_op_broadcast_as(device_id, precision):


 def test_op_broadcast_as_in_loop(device_id):
-    from .. import sequence, placeholder, past_value, input
+    from .. import sequence, placeholder, input

     a_data = [AA([1]), AA([2]), AA([3])]
     b_data = [AA([[2]]), AA([[2], [3]]), AA([[2], [3], [4]])]
@@ -522,7 +522,7 @@ def test_op_broadcast_as_in_loop(device_id):
     b = sequence.input(shape=(1,), name='b')

     out_placeholder = placeholder()
-    out_delayed = past_value(out_placeholder, time_step=5)
+    out_delayed = sequence.past_value(out_placeholder, time_step=5)
     out_delayed_plus_b = out_delayed + b
     out = sequence.broadcast_as(a, out_delayed_plus_b)
     out.replace_placeholder(out)
@@ -135,7 +135,7 @@ def test_training_3d_sparse_sequence_with_recurrence(device_id):
     a_projection = times(a, w_i)

     p_o = C.placeholder()
-    h = C.past_value(p_o)
+    h = C.sequence.past_value(p_o)
     w_h = C.parameter(init=w_init_h, device=dev)
     h_projection = times(h, w_h)
     z = a_projection + h_projection
@@ -10,7 +10,7 @@ from cntk import *

 def test_outputs():
     fwd_state = placeholder("placeholder")
-    prev_state = past_value(fwd_state, name="prev_state")
+    prev_state = sequence.past_value(fwd_state, name="prev_state")
     z = abs(prev_state, "abs")
     output = z.output
     z = z.replace_placeholders({fwd_state: z.output})
@@ -71,7 +71,7 @@ introduced, overloaded operators can be applied to them to form an operator grap
     import cntk as C

     # Create an input with the shape (2,3,*)
-    >>> x = C.input((2,3), name='features')
+    >>> x = C.sequence.input((2,3), name='features')

     # Create a constant scalar with value 2
     >>> c = C.constant(value=2)
@@ -80,29 +80,29 @@ introduced, overloaded operators can be applied to them to form an operator grap
     >>> w = C.parameter((2,3))

     # Set up some test input data to check the operators.
-    # We specify a full batch having one element, which is a
+    # We specify a full batch having a sequence with one element, which is a
     # (2,3) matrix.
-    >>> test_input = [ np.asarray([[10,20,30],[40,50,60]]) ]
+    >>> test_input = [[ np.asarray([[10,20,30],[40,50,60]]) ]]

     # Elementwise multiplication operation
     >>> op = x * c

     # Evaluate the op using test_input
     >>> print(op.eval({ x: test_input }))
-    array([[[  20.,   40.,   60.],
-            [  80.,  100.,  120.]]], dtype=float32)
+    [array([[[  20.,   40.,   60.],
+            [  80.,  100.,  120.]]], dtype=float32)]

     # Same as above (2 will be converted to constant)
     >>> op2 = x * 2
     >>> print(op2.eval({ x: test_input }))
-    array([[[  20.,   40.,   60.],
-            [  80.,  100.,  120.]]], dtype=float32)
+    [array([[[  20.,   40.,   60.],
+            [  80.,  100.,  120.]]], dtype=float32)]

     # Elementwise multiplication of two 2x3 matrices
     >>> op3 = x * [[1,2,3], [4,5,6]]
     >>> print(op3.eval({ x: test_input}))
-    array([[[  10.,   40.,   90.],
-            [ 160.,  250.,  360.]]], dtype=float32)
+    [array([[[  10.,   40.,   90.],
+            [ 160.,  250.,  360.]]], dtype=float32)]


 Broadcasting
@@ -866,8 +866,8 @@ end with a zero:
 Notes
 ~~~~~

-This layer is a wrapper around the ``past_value()`` and
-``future_value()`` primitives.
+This layer is a wrapper around the ``sequence.past_value()`` and
+``sequence.future_value()`` primitives.

 Example
 ~~~~~~~
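For completeness, the layer this note refers to wraps the two primitives behind a single step parameter; a small usage sketch (assuming cntk.layers.Delay with its T step parameter):

    import numpy as np
    import cntk as C
    from cntk.layers import Delay

    x = C.sequence.input(shape=(1,))
    x0 = [np.array([[1.0], [2.0], [3.0]], dtype=np.float32)]

    print(Delay(T=1)(x).eval({x: x0}))   # wraps sequence.past_value:   [[0.], [1.], [2.]]
    print(Delay(T=-1)(x).eval({x: x0}))  # wraps sequence.future_value: [[2.], [3.], [0.]]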