Merge branch 'master' of ssh://github.com/BVLC/caffe into bvlc

Kenneth Tran 2016-07-14 14:58:11 -07:00
Parents: a0532d30ff 50c9a0fc8e
Commit: 8b5ba8c165
87 changed files with 3167 additions and 293 deletions


@ -1,40 +1,53 @@
# Use a build matrix to do two builds in parallel:
# one using CMake, and one using make.
env:
matrix:
- WITH_CUDA=false WITH_CMAKE=false WITH_IO=true
- WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3
- WITH_CUDA=true WITH_CMAKE=false WITH_IO=true
- WITH_CUDA=true WITH_CMAKE=true WITH_IO=true
- WITH_CUDA=false WITH_CMAKE=false WITH_IO=false
- WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3
dist: trusty
sudo: required
language: cpp
# Cache Ubuntu apt packages.
cache:
apt: true
directories:
- /home/travis/miniconda
- /home/travis/miniconda2
- /home/travis/miniconda3
compiler: gcc
env:
global:
- NUM_THREADS=4
matrix:
# Use a build matrix to test many builds in parallel
# envvar defaults:
# WITH_CMAKE: false
# WITH_PYTHON3: false
# WITH_IO: true
# WITH_CUDA: false
# WITH_CUDNN: false
- BUILD_NAME="default-make"
# - BUILD_NAME="python3-make" WITH_PYTHON3=true
- BUILD_NAME="no-io-make" WITH_IO=false
- BUILD_NAME="cuda-make" WITH_CUDA=true
- BUILD_NAME="cudnn-make" WITH_CUDA=true WITH_CUDNN=true
- BUILD_NAME="default-cmake" WITH_CMAKE=true
- BUILD_NAME="python3-cmake" WITH_CMAKE=true WITH_PYTHON3=true
- BUILD_NAME="no-io-cmake" WITH_CMAKE=true WITH_IO=false
- BUILD_NAME="cuda-cmake" WITH_CMAKE=true WITH_CUDA=true
- BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDA=true WITH_CUDNN=true
cache:
timeout: 604800 # 1 week
apt: true
directories:
- ~/protobuf3
before_install:
- export NUM_THREADS=4
- export SCRIPTS=./scripts/travis
- export CONDA_DIR="/home/travis/miniconda$PYTHON_VERSION"
- source ./scripts/travis/defaults.sh
install:
- sudo -E $SCRIPTS/travis_install.sh
- sudo -E ./scripts/travis/install-deps.sh
- ./scripts/travis/setup-venv.sh ~/venv
- source ~/venv/bin/activate
- ./scripts/travis/install-python-deps.sh
before_script:
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64:$CONDA_DIR/lib
- export PATH=$CONDA_DIR/bin:$PATH
- if ! $WITH_CMAKE; then $SCRIPTS/travis_setup_makefile_config.sh; fi
- ./scripts/travis/configure.sh
script: $SCRIPTS/travis_build_and_test.sh
script:
- ./scripts/travis/build.sh
- ./scripts/travis/test.sh
notifications:
# Emails are sent to the committer's git-configured email address by default,


@ -10,8 +10,8 @@ endif()
project(Caffe C CXX)
# ---[ Caffe version
set(CAFFE_TARGET_VERSION "1.0.0-rc3")
set(CAFFE_TARGET_SOVERSION "1.0.0-rc3")
set(CAFFE_TARGET_VERSION "1.0.0-rc3" CACHE STRING "Caffe logical version")
set(CAFFE_TARGET_SOVERSION "1.0.0-rc3" CACHE STRING "Caffe soname version")
add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
# ---[ Using cmake scripts and modules


@ -272,7 +272,7 @@ endif
ifeq ($(OSX), 1)
CXX := /usr/bin/clang++
ifneq ($(CPU_ONLY), 1)
CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | grep -o '[0-9.]*')
CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]')
ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1)
CXXFLAGS += -stdlib=libstdc++
LINKFLAGS += -stdlib=libstdc++


@ -2,7 +2,7 @@
# This script downloads the CIFAR10 (binary version) data and unzips it.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
cd "$DIR"
echo "Downloading..."


@ -8,7 +8,7 @@
# - the training splits with labels
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
cd "$DIR"
echo "Downloading..."


@ -2,7 +2,7 @@
# This script downloads the mnist data and unzips it.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
cd "$DIR"
echo "Downloading..."


@ -22,7 +22,7 @@ docker_files: standalone_files
standalone_files: standalone/cpu/Dockerfile standalone/gpu/Dockerfile
FROM_GPU = "nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04"
FROM_GPU = "nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04"
FROM_CPU = "ubuntu:14.04"
GPU_CMAKE_ARGS = -DUSE_CUDNN=1
CPU_CMAKE_ARGS = -DCPU_ONLY=1


@ -1,4 +1,4 @@
FROM nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER caffe-maint@googlegroups.com
RUN apt-get update && apt-get install -y --no-install-recommends \


@ -40,14 +40,14 @@ Optional dependencies:
* [OpenCV](http://opencv.org/) >= 2.4 including 3.0
* IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
* cuDNN for GPU acceleration (v4)
* cuDNN for GPU acceleration (v5)
Pycaffe and Matcaffe interfaces have their own natural needs.
* For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python`
* For MATLAB Caffe: MATLAB with the `mex` compiler.
**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v4; older versions are supported in older Caffe.
**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v5; older versions are supported in older Caffe.
**CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.


@ -1,5 +1,6 @@
#!/usr/bin/env sh
# This script converts the cifar data into leveldb format.
set -e
EXAMPLE=examples/cifar10
DATA=data/cifar10


@ -1,16 +1,17 @@
#!/usr/bin/env sh
set -e
TOOLS=./build/tools
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_full_solver.prototxt
--solver=examples/cifar10/cifar10_full_solver.prototxt $@
# reduce learning rate by factor of 10
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \
--snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5
--snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@
# reduce learning rate by factor of 10
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \
--snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5
--snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@


@ -1,7 +1,8 @@
#!/usr/bin/env sh
set -e
TOOLS=./build/tools
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt
--solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt $@


@ -1,7 +1,8 @@
#!/usr/bin/env sh
set -e
TOOLS=./build/tools
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt
--solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt $@


@ -1,11 +1,12 @@
#!/usr/bin/env sh
set -e
TOOLS=./build/tools
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_quick_solver.prototxt
--solver=examples/cifar10/cifar10_quick_solver.prototxt $@
# reduce learning rate by factor of 10 after 8 epochs
$TOOLS/caffe train \
--solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \
--snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5
--snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@


@ -1,6 +1,7 @@
#!/usr/bin/env sh
# Create the imagenet lmdb inputs
# N.B. set the path to the imagenet train + val data dirs
set -e
EXAMPLE=examples/imagenet
DATA=data/ilsvrc12


@ -1,5 +1,7 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train \
--solver=models/bvlc_reference_caffenet/solver.prototxt \
--snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5
--snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5 \
$@


@ -1,4 +1,5 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train \
--solver=models/bvlc_reference_caffenet/solver.prototxt
--solver=models/bvlc_reference_caffenet/solver.prototxt $@

Binary file added: examples/images/cat gray.jpg (91 KiB; image not shown)


@ -1,6 +1,7 @@
#!/usr/bin/env sh
# This script converts the mnist data into lmdb/leveldb format,
# depending on the value assigned to $BACKEND.
set -e
EXAMPLE=examples/mnist
DATA=data/mnist


@ -1,3 +1,4 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt
./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt $@


@ -1,3 +1,4 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt
./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt $@


@ -1,4 +1,5 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train \
--solver=examples/mnist/lenet_consolidated_solver.prototxt
--solver=examples/mnist/lenet_consolidated_solver.prototxt $@


@ -1,3 +1,5 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train --solver=examples/mnist/lenet_solver_rmsprop.prototxt
./build/tools/caffe train \
--solver=examples/mnist/lenet_solver_rmsprop.prototxt $@


@ -1,4 +1,5 @@
#!/usr/bin/env sh
set -e
./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver.prototxt
--solver=examples/mnist/mnist_autoencoder_solver.prototxt $@


@ -1,4 +1,5 @@
#!/bin/bash
set -e
./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
--solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt $@


@ -1,4 +1,5 @@
#!/bin/bash
set -e
./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt
--solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt $@


@ -1,4 +1,5 @@
#!/bin/bash
set -e
./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt
--solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt $@


@ -22,7 +22,6 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import Image\n",
"\n",
"# Make sure that caffe is on the python path:\n",
"caffe_root = '../' # this file is expected to be in {caffe_root}/examples\n",
@ -3511,7 +3510,7 @@
"print(\"blobs {}\\nparams {}\".format(net.blobs.keys(), net.params.keys()))\n",
"\n",
"# load image and prepare as a single input batch for Caffe\n",
"im = np.array(Image.open('images/cat_gray.jpg'))\n",
"im = np.array(caffe.io.load_image('images/cat_gray.jpg', color=False)).squeeze()\n",
"plt.title(\"original image\")\n",
"plt.imshow(im)\n",
"plt.axis('off')\n",
@ -4480,8 +4479,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"pre-surgery output mean -12.93\n",
"post-surgery output mean -11.93\n"
"pre-surgery output mean -0.02\n",
"post-surgery output mean 0.98\n"
]
}
],
@ -4489,7 +4488,7 @@
"# pick first filter output\n",
"conv0 = net.blobs['conv'].data[0, 0]\n",
"print(\"pre-surgery output mean {:.2f}\".format(conv0.mean()))\n",
"# set first filter bias to 10\n",
"# set first filter bias to 1\n",
"net.params['conv'][1].data[0] = 1.\n",
"net.forward()\n",
"print(\"post-surgery output mean {:.2f}\".format(conv0.mean()))"
@ -5494,13 +5493,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1,2c1,2\r\n",
"1,2c1\r\n",
"< # Fully convolutional network version of CaffeNet.\r\n",
"< name: \"CaffeNetConv\"\r\n",
"---\r\n",
"> name: \"CaffeNet\"\r\n",
"> input: \"data\"\r\n",
"7,11c7\r\n",
"7,11c6\r\n",
"< input_param {\r\n",
"< # initial shape for a fully convolutional network:\r\n",
"< # the shape can be set for each input by reshape.\r\n",
@ -5508,33 +5506,33 @@
"< }\r\n",
"---\r\n",
"> input_param { shape: { dim: 10 dim: 3 dim: 227 dim: 227 } }\r\n",
"157,158c153,154\r\n",
"157,158c152,153\r\n",
"< name: \"fc6-conv\"\r\n",
"< type: \"Convolution\"\r\n",
"---\r\n",
"> name: \"fc6\"\r\n",
"> type: \"InnerProduct\"\r\n",
"160,161c156,157\r\n",
"160,161c155,156\r\n",
"< top: \"fc6-conv\"\r\n",
"< convolution_param {\r\n",
"---\r\n",
"> top: \"fc6\"\r\n",
"> inner_product_param {\r\n",
"163d158\r\n",
"163d157\r\n",
"< kernel_size: 6\r\n",
"169,170c164,165\r\n",
"169,170c163,164\r\n",
"< bottom: \"fc6-conv\"\r\n",
"< top: \"fc6-conv\"\r\n",
"---\r\n",
"> bottom: \"fc6\"\r\n",
"> top: \"fc6\"\r\n",
"175,176c170,171\r\n",
"175,176c169,170\r\n",
"< bottom: \"fc6-conv\"\r\n",
"< top: \"fc6-conv\"\r\n",
"---\r\n",
"> bottom: \"fc6\"\r\n",
"> top: \"fc6\"\r\n",
"182,186c177,181\r\n",
"182,186c176,180\r\n",
"< name: \"fc7-conv\"\r\n",
"< type: \"Convolution\"\r\n",
"< bottom: \"fc6-conv\"\r\n",
@ -5546,21 +5544,21 @@
"> bottom: \"fc6\"\r\n",
"> top: \"fc7\"\r\n",
"> inner_product_param {\r\n",
"188d182\r\n",
"188d181\r\n",
"< kernel_size: 1\r\n",
"194,195c188,189\r\n",
"194,195c187,188\r\n",
"< bottom: \"fc7-conv\"\r\n",
"< top: \"fc7-conv\"\r\n",
"---\r\n",
"> bottom: \"fc7\"\r\n",
"> top: \"fc7\"\r\n",
"200,201c194,195\r\n",
"200,201c193,194\r\n",
"< bottom: \"fc7-conv\"\r\n",
"< top: \"fc7-conv\"\r\n",
"---\r\n",
"> bottom: \"fc7\"\r\n",
"> top: \"fc7\"\r\n",
"207,211c201,205\r\n",
"207,211c200,204\r\n",
"< name: \"fc8-conv\"\r\n",
"< type: \"Convolution\"\r\n",
"< bottom: \"fc7-conv\"\r\n",
@ -5572,9 +5570,9 @@
"> bottom: \"fc7\"\r\n",
"> top: \"fc8\"\r\n",
"> inner_product_param {\r\n",
"213d206\r\n",
"213d205\r\n",
"< kernel_size: 1\r\n",
"219c212\r\n",
"219c211\r\n",
"< bottom: \"fc8-conv\"\r\n",
"---\r\n",
"> bottom: \"fc8\"\r\n"
@ -5610,13 +5608,6 @@
}
],
"source": [
"# Make sure that caffe is on the python path:\n",
"caffe_root = '../' # this file is expected to be in {caffe_root}/examples\n",
"import sys\n",
"sys.path.insert(0, caffe_root + 'python')\n",
"\n",
"import caffe\n",
"\n",
"# Load the original network and extract the fully connected layers' parameters.\n",
"net = caffe.Net('../models/bvlc_reference_caffenet/deploy.prototxt', \n",
" '../models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel', \n",


@ -1,5 +1,6 @@
#!/usr/bin/env sh
# This script converts the mnist data into leveldb format.
set -e
EXAMPLES=./build/examples/siamese
DATA=./data/mnist


@ -1,5 +1,6 @@
#!/usr/bin/env sh
set -e
TOOLS=./build/tools
$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt
$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt $@


@ -37,6 +37,7 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif


@ -37,6 +37,7 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif


@ -37,6 +37,7 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif


@ -0,0 +1,154 @@
#ifndef CAFFE_LSTM_LAYER_HPP_
#define CAFFE_LSTM_LAYER_HPP_
#include <string>
#include <utility>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"
namespace caffe {
template <typename Dtype> class RecurrentLayer;
/**
* @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
* [1] style recurrent neural network (RNN). Implemented by unrolling
* the LSTM computation through time.
*
* The specific architecture used in this implementation is as described in
* "Learning to Execute" [2], reproduced below:
* i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
* f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
* o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
* g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
* c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
* h_t := o_t .* \tanh[c_t]
* In the implementation, the i, f, o, and g computations are performed as a
* single inner product.
*
* Notably, this implementation lacks the "diagonal" gates, as used in the
* LSTM architectures described by Alex Graves [3] and others.
*
* [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
* Neural Computation 9, no. 8 (1997): 1735-1780.
*
* [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
* arXiv preprint arXiv:1410.4615 (2014).
*
* [3] Graves, Alex. "Generating sequences with recurrent neural networks."
* arXiv preprint arXiv:1308.0850 (2013).
*/
template <typename Dtype>
class LSTMLayer : public RecurrentLayer<Dtype> {
public:
explicit LSTMLayer(const LayerParameter& param)
: RecurrentLayer<Dtype>(param) {}
virtual inline const char* type() const { return "LSTM"; }
protected:
virtual void FillUnrolledNet(NetParameter* net_param) const;
virtual void RecurrentInputBlobNames(vector<string>* names) const;
virtual void RecurrentOutputBlobNames(vector<string>* names) const;
virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
virtual void OutputBlobNames(vector<string>* names) const;
};
/**
* @brief A helper for LSTMLayer: computes a single timestep of the
* non-linearity of the LSTM, producing the updated cell and hidden
* states.
*/
template <typename Dtype>
class LSTMUnitLayer : public Layer<Dtype> {
public:
explicit LSTMUnitLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const { return "LSTMUnit"; }
virtual inline int ExactNumBottomBlobs() const { return 3; }
virtual inline int ExactNumTopBlobs() const { return 2; }
virtual inline bool AllowForceBackward(const int bottom_index) const {
// Can't propagate to sequence continuation indicators.
return bottom_index != 2;
}
protected:
/**
* @param bottom input Blob vector (length 3)
* -# @f$ (1 \times N \times D) @f$
* the previous timestep cell state @f$ c_{t-1} @f$
* -# @f$ (1 \times N \times 4D) @f$
* the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
* -# @f$ (1 \times N) @f$
* the sequence continuation indicators @f$ \delta_t @f$
* @param top output Blob vector (length 2)
* -# @f$ (1 \times N \times D) @f$
* the updated cell state @f$ c_t @f$, computed as:
* i_t := \sigmoid[i_t']
* f_t := \sigmoid[f_t']
* o_t := \sigmoid[o_t']
* g_t := \tanh[g_t']
* c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
* -# @f$ (1 \times N \times D) @f$
* the updated hidden state @f$ h_t @f$, computed as:
* h_t := o_t .* \tanh[c_t]
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
/**
* @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
*
* @param top output Blob vector (length 2), providing the error gradient with
* respect to the outputs
* -# @f$ (1 \times N \times D) @f$:
* containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
* with respect to the updated cell state @f$ c_t @f$
* -# @f$ (1 \times N \times D) @f$:
* containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
* with respect to the updated cell state @f$ h_t @f$
* @param propagate_down see Layer::Backward.
* @param bottom input Blob vector (length 3), into which the error gradients
* with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
* inputs are computed. Computation of the error gradients w.r.t.
* the sequence indicators is not implemented.
* -# @f$ (1 \times N \times D) @f$
* the error gradient w.r.t. the previous timestep cell state
* @f$ c_{t-1} @f$
* -# @f$ (1 \times N \times 4D) @f$
* the error gradient w.r.t. the "gate inputs"
* @f$ [
* \frac{\partial E}{\partial i_t}
* \frac{\partial E}{\partial f_t}
* \frac{\partial E}{\partial o_t}
* \frac{\partial E}{\partial g_t}
* ] @f$
* -# @f$ (1 \times 1 \times N) @f$
* the gradient w.r.t. the sequence continuation indicators
* @f$ \delta_t @f$ is currently not computed.
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
/// @brief The hidden and output dimension.
int hidden_dim_;
Blob<Dtype> X_acts_;
};
} // namespace caffe
#endif // CAFFE_LSTM_LAYER_HPP_

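A minimal NumPy sketch of the single-timestep update documented above may help; the stacked weight shapes (W_x: 4D x input, W_h: 4D x D) and the [i, f, o, g] gate order are assumptions for illustration, not Caffe's internal layout:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W_x, W_h, b):
    # All four gate pre-activations in one inner product, as the header notes.
    D = h_prev.shape[-1]
    gates = W_x.dot(x_t) + W_h.dot(h_prev) + b
    i = sigmoid(gates[0 * D:1 * D])   # input gate
    f = sigmoid(gates[1 * D:2 * D])   # forget gate
    o = sigmoid(gates[2 * D:3 * D])   # output gate
    g = np.tanh(gates[3 * D:4 * D])   # candidate update
    c_t = f * c_prev + i * g          # new cell state
    h_t = o * np.tanh(c_t)            # new hidden state
    return h_t, c_t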

@ -0,0 +1,187 @@
#ifndef CAFFE_RECURRENT_LAYER_HPP_
#define CAFFE_RECURRENT_LAYER_HPP_
#include <string>
#include <utility>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/format.hpp"
namespace caffe {
template <typename Dtype> class RecurrentLayer;
/**
* @brief An abstract class for implementing recurrent behavior inside of an
* unrolled network. This Layer type cannot be instantiated -- instead,
* you should use one of its implementations which defines the recurrent
* architecture, such as RNNLayer or LSTMLayer.
*/
template <typename Dtype>
class RecurrentLayer : public Layer<Dtype> {
public:
explicit RecurrentLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reset();
virtual inline const char* type() const { return "Recurrent"; }
virtual inline int MinBottomBlobs() const {
int min_bottoms = 2;
if (this->layer_param_.recurrent_param().expose_hidden()) {
vector<string> inputs;
this->RecurrentInputBlobNames(&inputs);
min_bottoms += inputs.size();
}
return min_bottoms;
}
virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; }
virtual inline int ExactNumTopBlobs() const {
int num_tops = 1;
if (this->layer_param_.recurrent_param().expose_hidden()) {
vector<string> outputs;
this->RecurrentOutputBlobNames(&outputs);
num_tops += outputs.size();
}
return num_tops;
}
virtual inline bool AllowForceBackward(const int bottom_index) const {
// Can't propagate to sequence continuation indicators.
return bottom_index != 1;
}
protected:
/**
* @brief Fills net_param with the recurrent network architecture. Subclasses
* should define this -- see RNNLayer and LSTMLayer for examples.
*/
virtual void FillUnrolledNet(NetParameter* net_param) const = 0;
/**
* @brief Fills names with the names of the 0th timestep recurrent input
* Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer
* for examples.
*/
virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;
/**
* @brief Fills shapes with the shapes of the recurrent input Blob&s.
* Subclasses should define this -- see RNNLayer and LSTMLayer
* for examples.
*/
virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0;
/**
* @brief Fills names with the names of the Tth timestep recurrent output
* Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer
* for examples.
*/
virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;
/**
* @brief Fills names with the names of the output blobs, concatenated across
* all timesteps. Should return a name for each top Blob.
* Subclasses should define this -- see RNNLayer and LSTMLayer for
* examples.
*/
virtual void OutputBlobNames(vector<string>* names) const = 0;
/**
* @param bottom input Blob vector (length 2-3)
*
* -# @f$ (T \times N \times ...) @f$
* the time-varying input @f$ x @f$. After the first two axes, whose
* dimensions must correspond to the number of timesteps @f$ T @f$ and
* the number of independent streams @f$ N @f$, respectively, its
* dimensions may be arbitrary. Note that the ordering of dimensions --
* @f$ (T \times N \times ...) @f$, rather than
* @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
* independent input streams must be "interleaved".
*
* -# @f$ (T \times N) @f$
* the sequence continuation indicators @f$ \delta @f$.
* These inputs should be binary (0 or 1) indicators, where
* @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
* @f$ n @f$ is the beginning of a new sequence, and hence the previous
* hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
* and has no effect on the cell's output at timestep @f$ t @f$, and
* a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
* stream @f$ n @f$ is a continuation from the previous timestep
* @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
* updated hidden state and output.
*
* -# @f$ (N \times ...) @f$ (optional)
* the static (non-time-varying) input @f$ x_{static} @f$.
* After the first axis, whose dimension must be the number of
* independent streams, its dimensions may be arbitrary.
* This is mathematically equivalent to using a time-varying input of
* @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
* across the @f$ T @f$ timesteps and concatenating with the time-varying
* input. Note that if this input is used, all timesteps in a single
* batch within a particular one of the @f$ N @f$ streams must share the
* same static input, even if the sequence continuation indicators
* suggest that different sequences are ending and beginning within a
* single batch. This may require padding and/or truncation for uniform
* length.
*
* @param top output Blob vector (length 1)
* -# @f$ (T \times N \times D) @f$
* the time-varying output @f$ y @f$, where @f$ D @f$ is
* <code>recurrent_param.num_output()</code>.
* Refer to documentation for particular RecurrentLayer implementations
* (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
/// @brief A Net to implement the Recurrent functionality.
shared_ptr<Net<Dtype> > unrolled_net_;
/// @brief The number of independent streams to process simultaneously.
int N_;
/**
* @brief The number of timesteps in the layer's input, and the number of
* timesteps over which to backpropagate through time.
*/
int T_;
/// @brief Whether the layer has a "static" input copied across all timesteps.
bool static_input_;
/**
* @brief The last layer to run in the network. (Any later layers are losses
* added to force the recurrent net to do backprop.)
*/
int last_layer_index_;
/**
* @brief Whether the layer's hidden state at the first and last timesteps
* are layer inputs and outputs, respectively.
*/
bool expose_hidden_;
vector<Blob<Dtype>* > recur_input_blobs_;
vector<Blob<Dtype>* > recur_output_blobs_;
vector<Blob<Dtype>* > output_blobs_;
Blob<Dtype>* x_input_blob_;
Blob<Dtype>* x_static_input_blob_;
Blob<Dtype>* cont_input_blob_;
};
} // namespace caffe
#endif // CAFFE_RECURRENT_LAYER_HPP_

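To make the (T x N) continuation indicators concrete, here is a small NumPy sketch of how two interleaved streams with different sequence boundaries might be encoded (the shapes follow the @param docs above; the data itself is made up):

import numpy as np

T, N = 6, 2                              # 6 timesteps, 2 independent streams
cont = np.ones((T, N), dtype=np.float32)
cont[0, :] = 0   # t = 0 starts a new sequence in every stream
cont[3, 1] = 0   # stream 1 starts another sequence at t = 3, so its
                 # hidden state from t = 2 is zeroed out at that step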

@ -0,0 +1,47 @@
#ifndef CAFFE_RNN_LAYER_HPP_
#define CAFFE_RNN_LAYER_HPP_
#include <string>
#include <utility>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"
namespace caffe {
template <typename Dtype> class RecurrentLayer;
/**
* @brief Processes time-varying inputs using a simple recurrent neural network
* (RNN). Implemented as a network unrolling the RNN computation in time.
*
* Given time-varying inputs @f$ x_t @f$, computes hidden state @f$
* h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ]
* @f$, and outputs @f$
* o_t := \tanh[ W_{ho} h_t + b_o ]
* @f$.
*/
template <typename Dtype>
class RNNLayer : public RecurrentLayer<Dtype> {
public:
explicit RNNLayer(const LayerParameter& param)
: RecurrentLayer<Dtype>(param) {}
virtual inline const char* type() const { return "RNN"; }
protected:
virtual void FillUnrolledNet(NetParameter* net_param) const;
virtual void RecurrentInputBlobNames(vector<string>* names) const;
virtual void RecurrentOutputBlobNames(vector<string>* names) const;
virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
virtual void OutputBlobNames(vector<string>* names) const;
};
} // namespace caffe
#endif // CAFFE_RNN_LAYER_HPP_

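The same equations as a one-step NumPy sketch (illustrative shapes and names, not Caffe's implementation):

import numpy as np

def rnn_step(x_t, h_prev, W_xh, W_hh, b_h, W_ho, b_o):
    h_t = np.tanh(W_hh.dot(h_prev) + W_xh.dot(x_t) + b_h)  # hidden update
    o_t = np.tanh(W_ho.dot(h_t) + b_o)                     # output
    return h_t, o_t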

@ -25,6 +25,7 @@ class Net {
public:
explicit Net(const NetParameter& param, const Net* root_net = NULL);
explicit Net(const string& param_file, Phase phase,
const int level = 0, const vector<string>* stages = NULL,
const Net* root_net = NULL);
virtual ~Net() {}


@ -93,8 +93,13 @@ template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
int n, int c, int h, int w) {
CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
n, c, h, w));
CUDNN_TENSOR_NCHW, n, c, h, w));
#else
CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
CUDNN_TENSOR_NCHW, n, c, h, w));
#endif
}
template <typename Dtype>
@ -125,8 +130,21 @@ inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
LOG(FATAL) << "Unknown pooling method.";
}
CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
pad_h, pad_w, stride_h, stride_w));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode,
CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w));
#else
CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(*pool_desc, *mode,
CUDNN_PROPAGATE_NAN, h, w, pad_h, pad_w, stride_h, stride_w));
#endif
}
template <typename Dtype>
inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc,
cudnnActivationMode_t mode) {
CUDNN_CHECK(cudnnCreateActivationDescriptor(activ_desc));
CUDNN_CHECK(cudnnSetActivationDescriptor(*activ_desc, mode,
CUDNN_PROPAGATE_NAN, Dtype(0)));
}
} // namespace cudnn


@ -22,13 +22,19 @@ if(UNIX OR APPLE)
endif()
# ---[ Install
file(GLOB files1 *.py requirements.txt)
install(FILES ${files1} DESTINATION python)
# scripts
file(GLOB python_files *.py requirements.txt)
install(FILES ${python_files} DESTINATION python)
file(GLOB files2 caffe/*.py)
install(FILES ${files2} DESTINATION python/caffe)
# module
install(DIRECTORY caffe
DESTINATION python
FILES_MATCHING
PATTERN "*.py"
PATTERN "ilsvrc_2012_mean.npy"
PATTERN "test" EXCLUDE
)
# _caffe.so
install(TARGETS pycaffe DESTINATION python/caffe)
install(DIRECTORY caffe/imagenet caffe/proto caffe/test DESTINATION python/caffe)


@ -1,5 +1,5 @@
from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list
from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed
from ._caffe import __version__
from .proto.caffe_pb2 import TRAIN, TEST
from .classifier import Classifier

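A short usage sketch of the newly exported set_random_seed (the seed value is arbitrary); it seeds Caffe's random number generators, e.g. those used by weight fillers:

import caffe

caffe.set_mode_cpu()
caffe.set_random_seed(1337)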

@ -51,6 +51,8 @@ const int NPY_DTYPE = NPY_FLOAT32;
void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); }
// For convenience, check that input files can be opened, and raise an
// exception that boost will send to Python if not (caffe could still crash
// later if the input files are disturbed before they are actually used, but
@ -86,19 +88,42 @@ void CheckContiguousArray(PyArrayObject* arr, string name,
}
}
// Net constructor for passing phase as int
shared_ptr<Net<Dtype> > Net_Init(
string param_file, int phase) {
CheckFile(param_file);
// Net constructor
shared_ptr<Net<Dtype> > Net_Init(string network_file, int phase,
const int level, const bp::object& stages,
const bp::object& weights) {
CheckFile(network_file);
// Convert stages from list to vector
vector<string> stages_vector;
if (!stages.is_none()) {
for (int i = 0; i < len(stages); i++) {
stages_vector.push_back(bp::extract<string>(stages[i]));
}
}
// Initialize net
shared_ptr<Net<Dtype> > net(new Net<Dtype>(network_file,
static_cast<Phase>(phase), level, &stages_vector));
// Load weights
if (!weights.is_none()) {
std::string weights_file_str = bp::extract<std::string>(weights);
CheckFile(weights_file_str);
net->CopyTrainedLayersFrom(weights_file_str);
}
shared_ptr<Net<Dtype> > net(new Net<Dtype>(param_file,
static_cast<Phase>(phase)));
return net;
}
// Net construct-and-load convenience constructor
// Legacy Net construct-and-load convenience constructor
shared_ptr<Net<Dtype> > Net_Init_Load(
string param_file, string pretrained_param_file, int phase) {
LOG(WARNING) << "DEPRECATION WARNING - deprecated use of Python interface";
LOG(WARNING) << "Use this instead (with the named \"weights\""
<< " parameter):";
LOG(WARNING) << "Net('" << param_file << "', " << phase
<< ", weights='" << pretrained_param_file << "')";
CheckFile(param_file);
CheckFile(pretrained_param_file);
@ -114,6 +139,14 @@ void Net_Save(const Net<Dtype>& net, string filename) {
WriteProtoToBinaryFile(net_param, filename.c_str());
}
void Net_SaveHDF5(const Net<Dtype>& net, string filename) {
net.ToHDF5(filename);
}
void Net_LoadHDF5(Net<Dtype>* net, string filename) {
net->CopyTrainedLayersFromHDF5(filename.c_str());
}
void Net_SetInputArrays(Net<Dtype>* net, bp::object data_obj,
bp::object labels_obj) {
// check that this network has an input MemoryDataLayer
@ -220,6 +253,27 @@ bp::object BlobVec_add_blob(bp::tuple args, bp::dict kwargs) {
return bp::object();
}
template<typename Dtype>
class PythonCallback: public Solver<Dtype>::Callback {
protected:
bp::object on_start_, on_gradients_ready_;
public:
PythonCallback(bp::object on_start, bp::object on_gradients_ready)
: on_start_(on_start), on_gradients_ready_(on_gradients_ready) { }
virtual void on_gradients_ready() {
on_gradients_ready_();
}
virtual void on_start() {
on_start_();
}
};
template<typename Dtype>
void Solver_add_callback(Solver<Dtype> * solver, bp::object on_start,
bp::object on_gradients_ready) {
solver->add_callback(new PythonCallback<Dtype>(on_start, on_gradients_ready));
}
BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
BOOST_PYTHON_MODULE(_caffe) {
@ -231,17 +285,24 @@ BOOST_PYTHON_MODULE(_caffe) {
// Caffe utility functions
bp::def("set_mode_cpu", &set_mode_cpu);
bp::def("set_mode_gpu", &set_mode_gpu);
bp::def("set_random_seed", &set_random_seed);
bp::def("set_device", &Caffe::SetDevice);
bp::def("layer_type_list", &LayerRegistry<Dtype>::LayerTypeList);
bp::class_<Net<Dtype>, shared_ptr<Net<Dtype> >, boost::noncopyable >("Net",
bp::no_init)
.def("__init__", bp::make_constructor(&Net_Init))
// Constructor
.def("__init__", bp::make_constructor(&Net_Init,
bp::default_call_policies(), (bp::arg("network_file"), "phase",
bp::arg("level")=0, bp::arg("stages")=bp::object(),
bp::arg("weights")=bp::object())))
// Legacy constructor
.def("__init__", bp::make_constructor(&Net_Init_Load))
.def("_forward", &Net<Dtype>::ForwardFromTo)
.def("_backward", &Net<Dtype>::BackwardFromTo)
.def("reshape", &Net<Dtype>::Reshape)
.def("clear_param_diffs", &Net<Dtype>::ClearParamDiffs)
// The cast is to select a particular overload.
.def("copy_from", static_cast<void (Net<Dtype>::*)(const string)>(
&Net<Dtype>::CopyTrainedLayersFrom))
@ -267,7 +328,9 @@ BOOST_PYTHON_MODULE(_caffe) {
bp::return_value_policy<bp::copy_const_reference>()))
.def("_set_input_arrays", &Net_SetInputArrays,
bp::with_custodian_and_ward<1, 2, bp::with_custodian_and_ward<1, 3> >())
.def("save", &Net_Save);
.def("save", &Net_Save)
.def("save_hdf5", &Net_SaveHDF5)
.def("load_hdf5", &Net_LoadHDF5);
BP_REGISTER_SHARED_PTR_TO_PYTHON(Net<Dtype>);
bp::class_<Blob<Dtype>, shared_ptr<Blob<Dtype> >, boost::noncopyable>(
@ -307,6 +370,7 @@ BOOST_PYTHON_MODULE(_caffe) {
.add_property("test_nets", bp::make_function(&Solver<Dtype>::test_nets,
bp::return_internal_reference<>()))
.add_property("iter", &Solver<Dtype>::iter)
.def("add_callback", &Solver_add_callback<Dtype>)
.def("solve", static_cast<void (Solver<Dtype>::*)(const char*)>(
&Solver<Dtype>::Solve), SolveOverloads())
.def("step", &Solver<Dtype>::Step)

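Taken together, the new bindings allow keyword-argument construction, HDF5 weight round-trips, and solver callbacks from Python. A sketch with hypothetical file names, following the arguments registered above:

import caffe

net = caffe.Net('net.prototxt', caffe.TEST,        # named constructor
                level=1, stages=['deploy'],
                weights='weights.caffemodel')
net.save_hdf5('weights.h5')                        # HDF5 round-trip
net.load_hdf5('weights.h5')

def on_start():
    print('solver iteration starting')

def on_gradients_ready():
    print('gradients computed')

solver = caffe.get_solver('solver.prototxt')
solver.add_callback(on_start, on_gradients_ready)  # Python solver callbacks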

@ -83,7 +83,7 @@ class Detector(caffe.Net):
for ix, window_in in enumerate(window_inputs):
caffe_in[ix] = self.transformer.preprocess(in_, window_in)
out = self.forward_all(**{in_: caffe_in})
predictions = out[self.outputs[0]].squeeze(axis=(2, 3))
predictions = out[self.outputs[0]]
# Package predictions with images and windows.
detections = []


@ -127,7 +127,7 @@ def choose_color_by_layertype(layertype):
return color
def get_pydot_graph(caffe_net, rankdir, label_edges=True):
def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
"""Create a data structure which represents the `caffe_net`.
Parameters
@ -137,6 +137,9 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True):
Direction of graph layout.
label_edges : boolean, optional
Label the edges (default is True).
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
Returns
-------
@ -148,6 +151,19 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True):
pydot_nodes = {}
pydot_edges = []
for layer in caffe_net.layer:
if phase is not None:
included = False
if len(layer.include) == 0:
included = True
if len(layer.include) > 0 and len(layer.exclude) > 0:
raise ValueError('layer ' + layer.name + ' has both include '
'and exclude specified.')
for layer_phase in layer.include:
included = included or layer_phase.phase == phase
for layer_phase in layer.exclude:
included = included and not layer_phase.phase == phase
if not included:
continue
node_label = get_layer_label(layer, rankdir)
node_name = "%s_%s" % (layer.name, layer.type)
if (len(layer.bottom) == 1 and len(layer.top) == 1 and
@ -186,7 +202,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True):
return pydot_graph
def draw_net(caffe_net, rankdir, ext='png'):
def draw_net(caffe_net, rankdir, ext='png', phase=None):
"""Draws a caffe net and returns the image string encoded using the given
extension.
@ -195,16 +211,19 @@ def draw_net(caffe_net, rankdir, ext='png'):
caffe_net : a caffe.proto.caffe_pb2.NetParameter protocol buffer.
ext : string, optional
The image extension (the default is 'png').
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
Returns
-------
string :
Postscript representation of the graph.
"""
return get_pydot_graph(caffe_net, rankdir).create(format=ext)
return get_pydot_graph(caffe_net, rankdir, phase=phase).create(format=ext)
def draw_net_to_file(caffe_net, filename, rankdir='LR'):
def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None):
"""Draws a caffe net, and saves it to file using the format given as the
file extension. Use '.raw' to output raw text that you can manually feed
to graphviz to draw graphs.
@ -216,7 +235,10 @@ def draw_net_to_file(caffe_net, filename, rankdir='LR'):
The path to a file where the networks visualization will be stored.
rankdir : {'LR', 'TB', 'BT'}
Direction of graph layout.
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
"""
ext = filename[filename.rfind('.')+1:]
with open(filename, 'wb') as fid:
fid.write(draw_net(caffe_net, rankdir, ext))
fid.write(draw_net(caffe_net, rankdir, ext, phase))

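A usage sketch of the new phase filter (train_val.prototxt and net.png are hypothetical names):

from google.protobuf import text_format

import caffe
import caffe.draw
from caffe.proto import caffe_pb2

net = caffe_pb2.NetParameter()
text_format.Merge(open('train_val.prototxt').read(), net)
# phase=caffe.TEST keeps only TEST-phase layers; phase=None draws all layers.
caffe.draw.draw_net_to_file(net, 'net.png', rankdir='LR', phase=caffe.TEST)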

@ -46,7 +46,7 @@ def array_to_blobproto(arr, diff=None):
return blob
def arraylist_to_blobprotovecor_str(arraylist):
def arraylist_to_blobprotovector_str(arraylist):
"""Converts a list of arrays to a serialized blobprotovec, which could be
then passed to a network for processing.
"""


@ -292,21 +292,31 @@ def _Net_batch(self, blobs):
padding])
yield padded_batch
class _Net_IdNameWrapper:
    """
    A simple wrapper that allows the ids property to be accessed as a dict
    indexed by names. Used for top and bottom names
    """
    def __init__(self, net, func):
        self.net, self.func = net, func

    def __getitem__(self, name):
        # Map the layer name to id
        ids = self.func(self.net, list(self.net._layer_names).index(name))
        # Map the blob id to name
        id_to_name = list(self.net.blobs)
        return [id_to_name[i] for i in ids]

def _Net_get_id_name(func, field):
    """
    Generic property that maps func to the layer names into an OrderedDict.

    Used for top_names and bottom_names.

    Parameters
    ----------
    func: function id -> [id]
    field: implementation field name (cache)

    Returns
    ------
    A one-parameter function that can be set as a property.
    """
    @property
    def get_id_name(self):
        if not hasattr(self, field):
            id_to_name = list(self.blobs)
            res = OrderedDict([(self._layer_names[i],
                                [id_to_name[j] for j in func(self, i)])
                               for i in range(len(self.layers))])
            setattr(self, field, res)
        return getattr(self, field)
    return get_id_name
# Attach methods to Net.
Net.blobs = _Net_blobs
@ -320,5 +330,5 @@ Net.set_input_arrays = _Net_set_input_arrays
Net._batch = _Net_batch
Net.inputs = _Net_inputs
Net.outputs = _Net_outputs
Net.top_names = property(lambda n: _Net_IdNameWrapper(n, Net._top_ids))
Net.bottom_names = property(lambda n: _Net_IdNameWrapper(n, Net._bottom_ids))
Net.top_names = _Net_get_id_name(Net._top_ids, "_top_names")
Net.bottom_names = _Net_get_id_name(Net._bottom_ids, "_bottom_names")

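With the cached property above, both attributes come back as OrderedDicts keyed by layer name; for the simple conv/ip/loss net used in the tests that follow, the expected values look like this ('net.prototxt' is a hypothetical file):

import caffe

net = caffe.Net('net.prototxt', caffe.TEST)
# net.top_names    -> OrderedDict([('data', ['data', 'label']),
#                                  ('conv', ['conv']), ('ip', ['ip']),
#                                  ('loss', ['loss'])])
# net.bottom_names -> OrderedDict([('data', []), ('conv', ['data']),
#                                  ('ip', ['conv']), ('loss', ['ip', 'label'])])
print(net.top_names['loss'])     # ['loss']
print(net.bottom_names['loss'])  # ['ip', 'label']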

@ -3,6 +3,7 @@ import tempfile
import os
import numpy as np
import six
from collections import OrderedDict
import caffe
@ -63,19 +64,282 @@ class TestNet(unittest.TestCase):
self.net.forward()
self.net.backward()
def test_clear_param_diffs(self):
# Run a forward/backward step to have non-zero diffs
self.net.forward()
self.net.backward()
diff = self.net.params["conv"][0].diff
# Check that we have non-zero diffs
self.assertTrue(diff.max() > 0)
self.net.clear_param_diffs()
# Check that the diffs are now 0
self.assertTrue((diff == 0).all())
def test_inputs_outputs(self):
self.assertEqual(self.net.inputs, [])
self.assertEqual(self.net.outputs, ['loss'])
def test_top_bottom_names(self):
self.assertEqual(self.net.top_names,
OrderedDict([('data', ['data', 'label']),
('conv', ['conv']),
('ip', ['ip']),
('loss', ['loss'])]))
self.assertEqual(self.net.bottom_names,
OrderedDict([('data', []),
('conv', ['data']),
('ip', ['conv']),
('loss', ['ip', 'label'])]))
def test_save_and_read(self):
f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
f.close()
self.net.save(f.name)
net_file = simple_net_file(self.num_output)
net2 = caffe.Net(net_file, f.name, caffe.TRAIN)
# Test legacy constructor
# should print deprecation warning
caffe.Net(net_file, f.name, caffe.TRAIN)
# Test named constructor
net2 = caffe.Net(net_file, caffe.TRAIN, weights=f.name)
os.remove(net_file)
os.remove(f.name)
for name in self.net.params:
for i in range(len(self.net.params[name])):
self.assertEqual(abs(self.net.params[name][i].data
- net2.params[name][i].data).sum(), 0)
def test_save_hdf5(self):
f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
f.close()
self.net.save_hdf5(f.name)
net_file = simple_net_file(self.num_output)
net2 = caffe.Net(net_file, caffe.TRAIN)
net2.load_hdf5(f.name)
os.remove(net_file)
os.remove(f.name)
for name in self.net.params:
for i in range(len(self.net.params[name])):
self.assertEqual(abs(self.net.params[name][i].data
- net2.params[name][i].data).sum(), 0)
class TestLevels(unittest.TestCase):
TEST_NET = """
layer {
name: "data"
type: "DummyData"
top: "data"
dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } }
}
layer {
name: "NoLevel"
type: "InnerProduct"
bottom: "data"
top: "NoLevel"
inner_product_param { num_output: 1 }
}
layer {
name: "Level0Only"
type: "InnerProduct"
bottom: "data"
top: "Level0Only"
include { min_level: 0 max_level: 0 }
inner_product_param { num_output: 1 }
}
layer {
name: "Level1Only"
type: "InnerProduct"
bottom: "data"
top: "Level1Only"
include { min_level: 1 max_level: 1 }
inner_product_param { num_output: 1 }
}
layer {
name: "Level>=0"
type: "InnerProduct"
bottom: "data"
top: "Level>=0"
include { min_level: 0 }
inner_product_param { num_output: 1 }
}
layer {
name: "Level>=1"
type: "InnerProduct"
bottom: "data"
top: "Level>=1"
include { min_level: 1 }
inner_product_param { num_output: 1 }
}
"""
def setUp(self):
self.f = tempfile.NamedTemporaryFile(mode='w+')
self.f.write(self.TEST_NET)
self.f.flush()
def tearDown(self):
self.f.close()
def check_net(self, net, blobs):
net_blobs = [b for b in net.blobs.keys() if 'data' not in b]
self.assertEqual(net_blobs, blobs)
def test_0(self):
net = caffe.Net(self.f.name, caffe.TEST)
self.check_net(net, ['NoLevel', 'Level0Only', 'Level>=0'])
def test_1(self):
net = caffe.Net(self.f.name, caffe.TEST, level=1)
self.check_net(net, ['NoLevel', 'Level1Only', 'Level>=0', 'Level>=1'])
class TestStages(unittest.TestCase):
TEST_NET = """
layer {
name: "data"
type: "DummyData"
top: "data"
dummy_data_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } }
}
layer {
name: "A"
type: "InnerProduct"
bottom: "data"
top: "A"
include { stage: "A" }
inner_product_param { num_output: 1 }
}
layer {
name: "B"
type: "InnerProduct"
bottom: "data"
top: "B"
include { stage: "B" }
inner_product_param { num_output: 1 }
}
layer {
name: "AorB"
type: "InnerProduct"
bottom: "data"
top: "AorB"
include { stage: "A" }
include { stage: "B" }
inner_product_param { num_output: 1 }
}
layer {
name: "AandB"
type: "InnerProduct"
bottom: "data"
top: "AandB"
include { stage: "A" stage: "B" }
inner_product_param { num_output: 1 }
}
"""
def setUp(self):
self.f = tempfile.NamedTemporaryFile(mode='w+')
self.f.write(self.TEST_NET)
self.f.flush()
def tearDown(self):
self.f.close()
def check_net(self, net, blobs):
net_blobs = [b for b in net.blobs.keys() if 'data' not in b]
self.assertEqual(net_blobs, blobs)
def test_A(self):
net = caffe.Net(self.f.name, caffe.TEST, stages=['A'])
self.check_net(net, ['A', 'AorB'])
def test_B(self):
net = caffe.Net(self.f.name, caffe.TEST, stages=['B'])
self.check_net(net, ['B', 'AorB'])
def test_AandB(self):
net = caffe.Net(self.f.name, caffe.TEST, stages=['A', 'B'])
self.check_net(net, ['A', 'B', 'AorB', 'AandB'])
class TestAllInOne(unittest.TestCase):
TEST_NET = """
layer {
name: "train_data"
type: "DummyData"
top: "data"
top: "label"
dummy_data_param {
shape { dim: 1 dim: 1 dim: 10 dim: 10 }
shape { dim: 1 dim: 1 dim: 1 dim: 1 }
}
include { phase: TRAIN stage: "train" }
}
layer {
name: "val_data"
type: "DummyData"
top: "data"
top: "label"
dummy_data_param {
shape { dim: 1 dim: 1 dim: 10 dim: 10 }
shape { dim: 1 dim: 1 dim: 1 dim: 1 }
}
include { phase: TEST stage: "val" }
}
layer {
name: "deploy_data"
type: "Input"
top: "data"
input_param { shape { dim: 1 dim: 1 dim: 10 dim: 10 } }
include { phase: TEST stage: "deploy" }
}
layer {
name: "ip"
type: "InnerProduct"
bottom: "data"
top: "ip"
inner_product_param { num_output: 2 }
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip"
bottom: "label"
top: "loss"
include: { phase: TRAIN stage: "train" }
include: { phase: TEST stage: "val" }
}
layer {
name: "pred"
type: "Softmax"
bottom: "ip"
top: "pred"
include: { phase: TEST stage: "deploy" }
}
"""
def setUp(self):
self.f = tempfile.NamedTemporaryFile(mode='w+')
self.f.write(self.TEST_NET)
self.f.flush()
def tearDown(self):
self.f.close()
def check_net(self, net, outputs):
self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10])
self.assertEqual(net.outputs, outputs)
def test_train(self):
net = caffe.Net(self.f.name, caffe.TRAIN, stages=['train'])
self.check_net(net, ['loss'])
def test_val(self):
net = caffe.Net(self.f.name, caffe.TEST, stages=['val'])
self.check_net(net, ['loss'])
def test_deploy(self):
net = caffe.Net(self.f.name, caffe.TEST, stages=['deploy'])
self.check_net(net, ['pred'])


@ -28,6 +28,11 @@ def parse_args():
'http://www.graphviz.org/doc/info/'
'attrs.html#k:rankdir'),
default='LR')
parser.add_argument('--phase',
help=('Which network phase to draw: can be TRAIN, '
'TEST, or ALL. If ALL, then all layers are drawn '
'regardless of phase.'),
default="ALL")
args = parser.parse_args()
return args
@ -38,7 +43,15 @@ def main():
net = caffe_pb2.NetParameter()
text_format.Merge(open(args.input_net_proto_file).read(), net)
print('Drawing net to %s' % args.output_image_file)
caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir)
phase=None;
if args.phase == "TRAIN":
phase = caffe.TRAIN
elif args.phase == "TEST":
phase = caffe.TEST
elif args.phase != "ALL":
raise ValueError("Unknown phase: " + args.phase)
caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir,
phase)
if __name__ == '__main__':

scripts/travis/build.sh Executable file

@ -0,0 +1,13 @@
#!/bin/bash
# build the project
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
if ! $WITH_CMAKE ; then
make --jobs $NUM_THREADS all test pycaffe warn
else
cd build
make --jobs $NUM_THREADS all test.testbin
fi
make lint


@ -0,0 +1,32 @@
# CMake configuration
mkdir -p build
cd build
ARGS="-DCMAKE_BUILD_TYPE=Release -DBLAS=Open"
if $WITH_PYTHON3 ; then
ARGS="$ARGS -Dpython_version=3"
fi
if $WITH_IO ; then
ARGS="$ARGS -DUSE_OPENCV=On -DUSE_LMDB=On -DUSE_LEVELDB=On"
else
ARGS="$ARGS -DUSE_OPENCV=Off -DUSE_LMDB=Off -DUSE_LEVELDB=Off"
fi
if $WITH_CUDA ; then
# Only build SM50
ARGS="$ARGS -DCPU_ONLY=Off -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=\"50\" -DCUDA_ARCH_PTX=\"\""
else
ARGS="$ARGS -DCPU_ONLY=On"
fi
if $WITH_CUDNN ; then
ARGS="$ARGS -DUSE_CUDNN=On"
else
ARGS="$ARGS -DUSE_CUDNN=Off"
fi
cmake .. $ARGS


@ -0,0 +1,36 @@
# raw Makefile configuration
LINE () {
echo "$@" >> Makefile.config
}
cp Makefile.config.example Makefile.config
LINE "BLAS := open"
LINE "WITH_PYTHON_LAYER := 1"
if $WITH_PYTHON3 ; then
# TODO(lukeyeager) this path is currently disabled because of test errors like:
# ImportError: dynamic module does not define init function (PyInit__caffe)
LINE "PYTHON_LIBRARIES := python3.4m boost_python-py34"
LINE "PYTHON_INCLUDE := /usr/include/python3.4 /usr/lib/python3/dist-packages/numpy/core/include"
LINE "INCLUDE_DIRS := \$(INCLUDE_DIRS) \$(PYTHON_INCLUDE)"
fi
if ! $WITH_IO ; then
LINE "USE_OPENCV := 0"
LINE "USE_LEVELDB := 0"
LINE "USE_LMDB := 0"
fi
if $WITH_CUDA ; then
# Only build SM50
LINE "CUDA_ARCH := -gencode arch=compute_50,code=sm_50"
else
LINE "CPU_ONLY := 1"
fi
if $WITH_CUDNN ; then
LINE "USE_CUDNN := 1"
fi

scripts/travis/configure.sh Executable file

@ -0,0 +1,11 @@
#!/bin/bash
# configure the project
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
if ! $WITH_CMAKE ; then
source $BASEDIR/configure-make.sh
else
source $BASEDIR/configure-cmake.sh
fi

scripts/travis/defaults.sh Executable file

@ -0,0 +1,10 @@
#!/bin/bash
# set default environment variables
set -e
WITH_CMAKE=${WITH_CMAKE:-false}
WITH_PYTHON3=${WITH_PYTHON3:-false}
WITH_IO=${WITH_IO:-true}
WITH_CUDA=${WITH_CUDA:-false}
WITH_CUDNN=${WITH_CUDNN:-false}

scripts/travis/install-deps.sh Executable file

@ -0,0 +1,110 @@
#!/bin/bash
# install dependencies
# (this script must be run as root)
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
apt-get -y update
apt-get install -y --no-install-recommends \
build-essential \
libboost-filesystem-dev \
libboost-python-dev \
libboost-system-dev \
libboost-thread-dev \
libgflags-dev \
libgoogle-glog-dev \
libhdf5-serial-dev \
libopenblas-dev \
python-virtualenv \
wget
if $WITH_CMAKE ; then
apt-get install -y --no-install-recommends cmake
fi
if ! $WITH_PYTHON3 ; then
# Python2
apt-get install -y --no-install-recommends \
libprotobuf-dev \
protobuf-compiler \
python-dev \
python-numpy \
python-protobuf \
python-skimage
else
# Python3
apt-get install -y --no-install-recommends \
python3-dev \
python3-numpy \
python3-skimage
# build Protobuf3 since it's needed for Python3
PROTOBUF3_DIR=~/protobuf3
pushd .
if [ -d "$PROTOBUF3_DIR" ] && [ -e "$PROTOBUF3_DIR/src/protoc" ]; then
echo "Using cached protobuf3 build ..."
cd $PROTOBUF3_DIR
else
echo "Building protobuf3 from source ..."
rm -rf $PROTOBUF3_DIR
mkdir $PROTOBUF3_DIR
# install some more dependencies required to build protobuf3
apt-get install -y --no-install-recommends \
curl \
dh-autoreconf \
unzip
wget https://github.com/google/protobuf/archive/v3.0.0-beta-3.tar.gz -O protobuf3.tar.gz
tar -xzf protobuf3.tar.gz -C $PROTOBUF3_DIR --strip 1
rm protobuf3.tar.gz
cd $PROTOBUF3_DIR
./autogen.sh
./configure --prefix=/usr
make --jobs=$NUM_THREADS
fi
make install
popd
fi
if $WITH_IO ; then
apt-get install -y --no-install-recommends \
libleveldb-dev \
liblmdb-dev \
libopencv-dev \
libsnappy-dev
fi
if $WITH_CUDA ; then
# install repo packages
CUDA_REPO_PKG=cuda-repo-ubuntu1404_7.5-18_amd64.deb
wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG
dpkg -i $CUDA_REPO_PKG
rm $CUDA_REPO_PKG
if $WITH_CUDNN ; then
ML_REPO_PKG=nvidia-machine-learning-repo_4.0-2_amd64.deb
wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKG
dpkg -i $ML_REPO_PKG
fi
# update package lists
apt-get -y update
# install packages
CUDA_PKG_VERSION="7-5"
CUDA_VERSION="7.5"
apt-get install -y --no-install-recommends \
cuda-core-$CUDA_PKG_VERSION \
cuda-cudart-dev-$CUDA_PKG_VERSION \
cuda-cublas-dev-$CUDA_PKG_VERSION \
cuda-curand-dev-$CUDA_PKG_VERSION
# manually create CUDA symlink
ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda
if $WITH_CUDNN ; then
apt-get install -y --no-install-recommends libcudnn5-dev
fi
fi


@ -0,0 +1,14 @@
#!/bin/bash
# install extra Python dependencies
# (must come after setup-venv)
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
if ! $WITH_PYTHON3 ; then
# Python2
:
else
# Python3
pip install --pre protobuf==3.0.0b3
fi

scripts/travis/setup-venv.sh Executable file

@ -0,0 +1,18 @@
#!/bin/bash
# setup a Python virtualenv
# (must come after install-deps)
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
VENV_DIR=${1:-~/venv}
# setup our own virtualenv
if $WITH_PYTHON3; then
PYTHON_EXE='/usr/bin/python3'
else
PYTHON_EXE='/usr/bin/python2'
fi
# use --system-site-packages so that Python will use deb packages
virtualenv $VENV_DIR -p $PYTHON_EXE --system-site-packages

scripts/travis/test.sh Executable file

@ -0,0 +1,19 @@
#!/bin/bash
# test the project
BASEDIR=$(dirname $0)
source $BASEDIR/defaults.sh
if $WITH_CUDA ; then
echo "Skipping tests for CUDA build"
exit 0
fi
if ! $WITH_CMAKE ; then
make runtest
make pytest
else
cd build
make runtest
make pytest
fi


@ -1,54 +0,0 @@
#!/bin/bash
# Script called by Travis to build and test Caffe.
# Travis CI tests are CPU-only for lack of compatible hardware.
set -e
MAKE="make --jobs=$NUM_THREADS --keep-going"
if $WITH_CMAKE; then
mkdir build
cd build
CPU_ONLY=" -DCPU_ONLY=ON"
if ! $WITH_CUDA; then
CPU_ONLY=" -DCPU_ONLY=OFF"
fi
PYTHON_ARGS=""
if [ "$PYTHON_VERSION" = "3" ]; then
PYTHON_ARGS="$PYTHON_ARGS -Dpython_version=3 -DBOOST_LIBRARYDIR=$CONDA_DIR/lib/"
fi
if $WITH_IO; then
IO_ARGS="-DUSE_OPENCV=ON -DUSE_LMDB=ON -DUSE_LEVELDB=ON"
else
IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF"
fi
cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS ..
$MAKE
$MAKE pytest
if ! $WITH_CUDA; then
$MAKE runtest
$MAKE lint
fi
$MAKE clean
cd -
else
if ! $WITH_CUDA; then
export CPU_ONLY=1
fi
if $WITH_IO; then
export USE_LMDB=1
export USE_LEVELDB=1
export USE_OPENCV=1
fi
$MAKE all test pycaffe warn lint || true
if ! $WITH_CUDA; then
$MAKE runtest
fi
$MAKE all
$MAKE test
$MAKE pycaffe
$MAKE pytest
$MAKE warn
if ! $WITH_CUDA; then
$MAKE lint
fi
fi


@ -1,31 +0,0 @@
#!/bin/bash
set -e
mv Makefile.config.example Makefile.config
if $WITH_CUDA; then
# Only generate compute_50.
GENCODE="-gencode arch=compute_50,code=sm_50"
GENCODE="$GENCODE -gencode arch=compute_50,code=compute_50"
echo "CUDA_ARCH := $GENCODE" >> Makefile.config
fi
# Remove IO library settings from Makefile.config
# to avoid conflicts with CI configuration
sed -i -e '/USE_LMDB/d' Makefile.config
sed -i -e '/USE_LEVELDB/d' Makefile.config
sed -i -e '/USE_OPENCV/d' Makefile.config
cat << 'EOF' >> Makefile.config
# Travis' nvcc doesn't like newer boost versions
NVCCFLAGS := -Xcudafe --diag_suppress=cc_clobber_ignored -Xcudafe --diag_suppress=useless_using_declaration -Xcudafe --diag_suppress=set_but_not_used
ANACONDA_HOME := $(CONDA_DIR)
PYTHON_INCLUDE := $(ANACONDA_HOME)/include \
$(ANACONDA_HOME)/include/python2.7 \
$(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include
PYTHON_LIB := $(ANACONDA_HOME)/lib
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
WITH_PYTHON_LAYER := 1
EOF

@@ -30,19 +30,11 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
// Bias.
if (this->bias_term_) {
const Dtype* bias_data = this->blobs_[1]->gpu_data();
#if CUDNN_VERSION_MIN(4, 0, 0)
CUDNN_CHECK(cudnnAddTensor(handle_[g],
cudnn::dataType<Dtype>::one,
bias_desc_, bias_data + bias_offset_ * g,
cudnn::dataType<Dtype>::one,
top_descs_[i], top_data + top_offset_ * g));
#else
CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C,
cudnn::dataType<Dtype>::one,
bias_desc_, bias_data + bias_offset_ * g,
cudnn::dataType<Dtype>::one,
top_descs_[i], top_data + top_offset_ * g));
#endif
}
}
@@ -82,7 +74,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
// Gradient w.r.t. weights.
if (this->param_propagate_down_[0]) {
const Dtype* bottom_data = bottom[i]->gpu_data();
CUDNN_CHECK(cudnnConvolutionBackwardFilter_v3(
CUDNN_CHECK(cudnnConvolutionBackwardFilter(
handle_[1*this->group_ + g],
cudnn::dataType<Dtype>::one,
bottom_descs_[i], bottom_data + bottom_offset_ * g,
@@ -100,7 +92,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
weight = this->blobs_[0]->gpu_data();
}
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
CUDNN_CHECK(cudnnConvolutionBackwardData_v3(
CUDNN_CHECK(cudnnConvolutionBackwardData(
handle_[2*this->group_ + g],
cudnn::dataType<Dtype>::one,
filter_desc_, weight + this->weight_offset_ * g,

@@ -13,6 +13,7 @@ void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
CUDNN_CHECK(cudnnCreate(&handle_));
cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
cudnn::createTensor4dDesc<Dtype>(&top_desc_);
cudnn::createActivationDescriptor<Dtype>(&activ_desc_, CUDNN_ACTIVATION_RELU);
handles_setup_ = true;
}

@@ -15,12 +15,21 @@ void CuDNNReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(this->handle_,
CUDNN_ACTIVATION_RELU,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#else
CUDNN_CHECK(cudnnActivationForward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#endif
}
template <typename Dtype>
@@ -40,13 +49,23 @@ void CuDNNReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->gpu_diff();
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(this->handle_,
CUDNN_ACTIVATION_RELU,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#endif
}
INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer);
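For readers new to the cuDNN 5 API: the activ_desc_ used above is an opaque descriptor that now carries the activation mode, which older cuDNN versions took as a per-call enum. A minimal standalone sketch of building such a descriptor with the raw cuDNN 5 entry points (Caffe's own cudnn::createActivationDescriptor wrapper lives in caffe/util/cudnn.hpp and may choose a different NaN policy):

#include <cudnn.h>
#include "caffe/util/cudnn.hpp"  // for CUDNN_CHECK

// Sketch only: build and tear down a ReLU activation descriptor (cuDNN >= 5).
void example_relu_descriptor() {
  cudnnActivationDescriptor_t activ_desc;
  CUDNN_CHECK(cudnnCreateActivationDescriptor(&activ_desc));
  CUDNN_CHECK(cudnnSetActivationDescriptor(activ_desc,
      CUDNN_ACTIVATION_RELU,  // mode that older cuDNN took per Forward/Backward call
      CUDNN_PROPAGATE_NAN,    // NaN policy (an assumption; the wrapper may differ)
      0.0));                  // ceiling, only meaningful for clipped ReLU
  // ... pass activ_desc to cudnnActivationForward/Backward as above ...
  CUDNN_CHECK(cudnnDestroyActivationDescriptor(activ_desc));
}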

@@ -13,6 +13,8 @@ void CuDNNSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
CUDNN_CHECK(cudnnCreate(&handle_));
cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
cudnn::createTensor4dDesc<Dtype>(&top_desc_);
cudnn::createActivationDescriptor<Dtype>(&activ_desc_,
CUDNN_ACTIVATION_SIGMOID);
handles_setup_ = true;
}

@@ -10,12 +10,21 @@ void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(this->handle_,
CUDNN_ACTIVATION_SIGMOID,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#else
CUDNN_CHECK(cudnnActivationForward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#endif
}
template <typename Dtype>
@@ -30,13 +39,23 @@ void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->gpu_diff();
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(this->handle_,
CUDNN_ACTIVATION_SIGMOID,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#endif
}
INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer);

@@ -13,6 +13,7 @@ void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
CUDNN_CHECK(cudnnCreate(&handle_));
cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
cudnn::createTensor4dDesc<Dtype>(&top_desc_);
cudnn::createActivationDescriptor<Dtype>(&activ_desc_, CUDNN_ACTIVATION_TANH);
handles_setup_ = true;
}

@@ -10,12 +10,21 @@ void CuDNNTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(this->handle_,
CUDNN_ACTIVATION_TANH,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#else
CUDNN_CHECK(cudnnActivationForward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->top_desc_, top_data));
#endif
}
template <typename Dtype>
@@ -31,13 +40,23 @@ void CuDNNTanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(this->handle_,
CUDNN_ACTIVATION_TANH,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(this->handle_,
activ_desc_,
cudnn::dataType<Dtype>::one,
this->top_desc_, top_data, this->top_desc_, top_diff,
this->bottom_desc_, bottom_data,
cudnn::dataType<Dtype>::zero,
this->bottom_desc_, bottom_diff));
#endif
}
INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer);

@@ -37,12 +37,17 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const string& source = this->layer_param_.image_data_param().source();
LOG(INFO) << "Opening file " << source;
std::ifstream infile(source.c_str());
string filename;
string line;
size_t pos;
int label;
while (infile >> filename >> label) {
lines_.push_back(std::make_pair(filename, label));
while (std::getline(infile, line)) {
pos = line.find_last_of(' ');
label = atoi(line.substr(pos + 1).c_str());
lines_.push_back(std::make_pair(line.substr(0, pos), label));
}
CHECK(!lines_.empty()) << "File is empty";
if (this->layer_param_.image_data_param().shuffle()) {
// randomly shuffle data
LOG(INFO) << "Shuffling data";
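The getline-based loop above splits each listing line at its last space, so a label can follow a path that itself contains spaces. A self-contained sketch of the same parsing logic (the sample line is made up):

#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  std::string line = "images/cat gray.jpg 1";      // hypothetical listing entry
  size_t pos = line.find_last_of(' ');             // split at the last space
  int label = atoi(line.substr(pos + 1).c_str());  // text after it is the label
  std::string filename = line.substr(0, pos);      // text before it is the path
  std::cout << filename << " -> " << label << std::endl;
  return 0;
}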

@@ -0,0 +1,244 @@
#include <string>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/lstm_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
names->resize(2);
(*names)[0] = "h_0";
(*names)[1] = "c_0";
}
template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
names->resize(2);
(*names)[0] = "h_" + format_int(this->T_);
(*names)[1] = "c_T";
}
template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const {
const int num_output = this->layer_param_.recurrent_param().num_output();
const int num_blobs = 2;
shapes->resize(num_blobs);
for (int i = 0; i < num_blobs; ++i) {
(*shapes)[i].Clear();
(*shapes)[i].add_dim(1); // a single timestep
(*shapes)[i].add_dim(this->N_);
(*shapes)[i].add_dim(num_output);
}
}
template <typename Dtype>
void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
names->resize(1);
(*names)[0] = "h";
}
template <typename Dtype>
void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
const int num_output = this->layer_param_.recurrent_param().num_output();
CHECK_GT(num_output, 0) << "num_output must be positive";
const FillerParameter& weight_filler =
this->layer_param_.recurrent_param().weight_filler();
const FillerParameter& bias_filler =
this->layer_param_.recurrent_param().bias_filler();
// Add generic LayerParameters (without bottoms/tops) for the layer types
// we'll use repeatedly below, to avoid redundant code.
LayerParameter hidden_param;
hidden_param.set_type("InnerProduct");
hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4);
hidden_param.mutable_inner_product_param()->set_bias_term(false);
hidden_param.mutable_inner_product_param()->set_axis(2);
hidden_param.mutable_inner_product_param()->
mutable_weight_filler()->CopyFrom(weight_filler);
LayerParameter biased_hidden_param(hidden_param);
biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
biased_hidden_param.mutable_inner_product_param()->
mutable_bias_filler()->CopyFrom(bias_filler);
LayerParameter sum_param;
sum_param.set_type("Eltwise");
sum_param.mutable_eltwise_param()->set_operation(
EltwiseParameter_EltwiseOp_SUM);
LayerParameter scale_param;
scale_param.set_type("Scale");
scale_param.mutable_scale_param()->set_axis(0);
LayerParameter slice_param;
slice_param.set_type("Slice");
slice_param.mutable_slice_param()->set_axis(0);
LayerParameter split_param;
split_param.set_type("Split");
vector<BlobShape> input_shapes;
RecurrentInputShapes(&input_shapes);
CHECK_EQ(2, input_shapes.size());
LayerParameter* input_layer_param = net_param->add_layer();
input_layer_param->set_type("Input");
InputParameter* input_param = input_layer_param->mutable_input_param();
input_layer_param->add_top("c_0");
input_param->add_shape()->CopyFrom(input_shapes[0]);
input_layer_param->add_top("h_0");
input_param->add_shape()->CopyFrom(input_shapes[1]);
LayerParameter* cont_slice_param = net_param->add_layer();
cont_slice_param->CopyFrom(slice_param);
cont_slice_param->set_name("cont_slice");
cont_slice_param->add_bottom("cont");
cont_slice_param->mutable_slice_param()->set_axis(0);
// Add layer to transform all timesteps of x to the hidden state dimension.
// W_xc_x = W_xc * x + b_c
{
LayerParameter* x_transform_param = net_param->add_layer();
x_transform_param->CopyFrom(biased_hidden_param);
x_transform_param->set_name("x_transform");
x_transform_param->add_param()->set_name("W_xc");
x_transform_param->add_param()->set_name("b_c");
x_transform_param->add_bottom("x");
x_transform_param->add_top("W_xc_x");
x_transform_param->add_propagate_down(true);
}
if (this->static_input_) {
// Add layer to transform x_static to the gate dimension.
// W_xc_x_static = W_xc_static * x_static
LayerParameter* x_static_transform_param = net_param->add_layer();
x_static_transform_param->CopyFrom(hidden_param);
x_static_transform_param->mutable_inner_product_param()->set_axis(1);
x_static_transform_param->set_name("W_xc_x_static");
x_static_transform_param->add_param()->set_name("W_xc_static");
x_static_transform_param->add_bottom("x_static");
x_static_transform_param->add_top("W_xc_x_static_preshape");
x_static_transform_param->add_propagate_down(true);
LayerParameter* reshape_param = net_param->add_layer();
reshape_param->set_type("Reshape");
BlobShape* new_shape =
reshape_param->mutable_reshape_param()->mutable_shape();
new_shape->add_dim(1); // One timestep.
// Should infer this->N as the dimension so we can reshape on batch size.
new_shape->add_dim(-1);
new_shape->add_dim(
x_static_transform_param->inner_product_param().num_output());
reshape_param->set_name("W_xc_x_static_reshape");
reshape_param->add_bottom("W_xc_x_static_preshape");
reshape_param->add_top("W_xc_x_static");
}
LayerParameter* x_slice_param = net_param->add_layer();
x_slice_param->CopyFrom(slice_param);
x_slice_param->add_bottom("W_xc_x");
x_slice_param->set_name("W_xc_x_slice");
LayerParameter output_concat_layer;
output_concat_layer.set_name("h_concat");
output_concat_layer.set_type("Concat");
output_concat_layer.add_top("h");
output_concat_layer.mutable_concat_param()->set_axis(0);
for (int t = 1; t <= this->T_; ++t) {
string tm1s = format_int(t - 1);
string ts = format_int(t);
cont_slice_param->add_top("cont_" + ts);
x_slice_param->add_top("W_xc_x_" + ts);
// Add layers to flush the hidden state when beginning a new
// sequence, as indicated by cont_t.
// h_conted_{t-1} := cont_t * h_{t-1}
//
// Normally, cont_t is binary (i.e., 0 or 1), so:
// h_conted_{t-1} := h_{t-1} if cont_t == 1
// 0 otherwise
{
LayerParameter* cont_h_param = net_param->add_layer();
cont_h_param->CopyFrom(scale_param);
cont_h_param->set_name("h_conted_" + tm1s);
cont_h_param->add_bottom("h_" + tm1s);
cont_h_param->add_bottom("cont_" + ts);
cont_h_param->add_top("h_conted_" + tm1s);
}
// Add layer to compute
// W_hc_h_{t-1} := W_hc * h_conted_{t-1}
{
LayerParameter* w_param = net_param->add_layer();
w_param->CopyFrom(hidden_param);
w_param->set_name("transform_" + ts);
w_param->add_param()->set_name("W_hc");
w_param->add_bottom("h_conted_" + tm1s);
w_param->add_top("W_hc_h_" + tm1s);
w_param->mutable_inner_product_param()->set_axis(2);
}
// Add the outputs of the linear transformations to compute the gate input.
// gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
// = W_hc_h_{t-1} + W_xc_x_t + b_c
{
LayerParameter* input_sum_layer = net_param->add_layer();
input_sum_layer->CopyFrom(sum_param);
input_sum_layer->set_name("gate_input_" + ts);
input_sum_layer->add_bottom("W_hc_h_" + tm1s);
input_sum_layer->add_bottom("W_xc_x_" + ts);
if (this->static_input_) {
input_sum_layer->add_bottom("W_xc_x_static");
}
input_sum_layer->add_top("gate_input_" + ts);
}
// Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t.
// Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t
// Outputs: c_t, h_t
// [ i_t' ]
// [ f_t' ] := gate_input_t
// [ o_t' ]
// [ g_t' ]
// i_t := \sigmoid[i_t']
// f_t := \sigmoid[f_t']
// o_t := \sigmoid[o_t']
// g_t := \tanh[g_t']
// c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
// h_t := o_t .* \tanh[c_t]
{
LayerParameter* lstm_unit_param = net_param->add_layer();
lstm_unit_param->set_type("LSTMUnit");
lstm_unit_param->add_bottom("c_" + tm1s);
lstm_unit_param->add_bottom("gate_input_" + ts);
lstm_unit_param->add_bottom("cont_" + ts);
lstm_unit_param->add_top("c_" + ts);
lstm_unit_param->add_top("h_" + ts);
lstm_unit_param->set_name("unit_" + ts);
}
output_concat_layer.add_bottom("h_" + ts);
} // for (int t = 1; t <= this->T_; ++t)
{
LayerParameter* c_T_copy_param = net_param->add_layer();
c_T_copy_param->CopyFrom(split_param);
c_T_copy_param->add_bottom("c_" + format_int(this->T_));
c_T_copy_param->add_top("c_T");
}
net_param->add_layer()->CopyFrom(output_concat_layer);
}
INSTANTIATE_CLASS(LSTMLayer);
REGISTER_LAYER_CLASS(LSTM);
} // namespace caffe
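Taken together, the generated layers implement the standard LSTM recurrence. In the notation of the comments above (with \odot for elementwise product, and the bracketed static term present only when x_static is given), each timestep computes:

\[
\begin{aligned}
\tilde h_{t-1} &= \mathrm{cont}_t \odot h_{t-1} \\
(i_t',\, f_t',\, o_t',\, g_t') &= W_{hc}\,\tilde h_{t-1} + W_{xc}\,x_t + b_c \;[+\; W_{xc\_static}\,x_{static}] \\
i_t = \sigma(i_t'),\quad f_t &= \sigma(f_t'),\quad o_t = \sigma(o_t'),\quad g_t = \tanh(g_t') \\
c_t &= \mathrm{cont}_t\,(f_t \odot c_{t-1}) + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
\]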

@@ -0,0 +1,131 @@
#include <algorithm>
#include <cmath>
#include <vector>
#include "caffe/layer.hpp"
#include "caffe/layers/lstm_layer.hpp"
namespace caffe {
template <typename Dtype>
inline Dtype sigmoid(Dtype x) {
return 1. / (1. + exp(-x));
}
template <typename Dtype>
inline Dtype tanh(Dtype x) {
return 2. * sigmoid(2. * x) - 1.;
}
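// Note: the identity used above is exact; with \sigmoid(x) = 1 / (1 + exp(-x)),
//   2 * \sigmoid(2x) - 1 = (1 - exp(-2x)) / (1 + exp(-2x)) = \tanh(x),
// so tanh is computed from the sigmoid helper without calling std::tanh.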
template <typename Dtype>
void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int num_instances = bottom[0]->shape(1);
for (int i = 0; i < bottom.size(); ++i) {
if (i == 2) {
CHECK_EQ(2, bottom[i]->num_axes());
} else {
CHECK_EQ(3, bottom[i]->num_axes());
}
CHECK_EQ(1, bottom[i]->shape(0));
CHECK_EQ(num_instances, bottom[i]->shape(1));
}
hidden_dim_ = bottom[0]->shape(2);
CHECK_EQ(num_instances, bottom[1]->shape(1));
CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2));
top[0]->ReshapeLike(*bottom[0]);
top[1]->ReshapeLike(*bottom[0]);
X_acts_.ReshapeLike(*bottom[1]);
}
template <typename Dtype>
void LSTMUnitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int num = bottom[0]->shape(1);
const int x_dim = hidden_dim_ * 4;
const Dtype* C_prev = bottom[0]->cpu_data();
const Dtype* X = bottom[1]->cpu_data();
const Dtype* cont = bottom[2]->cpu_data();
Dtype* C = top[0]->mutable_cpu_data();
Dtype* H = top[1]->mutable_cpu_data();
for (int n = 0; n < num; ++n) {
for (int d = 0; d < hidden_dim_; ++d) {
const Dtype i = sigmoid(X[d]);
const Dtype f = (*cont == 0) ? 0 :
(*cont * sigmoid(X[1 * hidden_dim_ + d]));
const Dtype o = sigmoid(X[2 * hidden_dim_ + d]);
const Dtype g = tanh(X[3 * hidden_dim_ + d]);
const Dtype c_prev = C_prev[d];
const Dtype c = f * c_prev + i * g;
C[d] = c;
const Dtype tanh_c = tanh(c);
H[d] = o * tanh_c;
}
C_prev += hidden_dim_;
X += x_dim;
C += hidden_dim_;
H += hidden_dim_;
++cont;
}
}
template <typename Dtype>
void LSTMUnitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators.";
if (!propagate_down[0] && !propagate_down[1]) { return; }
const int num = bottom[0]->shape(1);
const int x_dim = hidden_dim_ * 4;
const Dtype* C_prev = bottom[0]->cpu_data();
const Dtype* X = bottom[1]->cpu_data();
const Dtype* cont = bottom[2]->cpu_data();
const Dtype* C = top[0]->cpu_data();
const Dtype* H = top[1]->cpu_data();
const Dtype* C_diff = top[0]->cpu_diff();
const Dtype* H_diff = top[1]->cpu_diff();
Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff();
Dtype* X_diff = bottom[1]->mutable_cpu_diff();
for (int n = 0; n < num; ++n) {
for (int d = 0; d < hidden_dim_; ++d) {
const Dtype i = sigmoid(X[d]);
const Dtype f = (*cont == 0) ? 0 :
(*cont * sigmoid(X[1 * hidden_dim_ + d]));
const Dtype o = sigmoid(X[2 * hidden_dim_ + d]);
const Dtype g = tanh(X[3 * hidden_dim_ + d]);
const Dtype c_prev = C_prev[d];
const Dtype c = C[d];
const Dtype tanh_c = tanh(c);
Dtype* c_prev_diff = C_prev_diff + d;
Dtype* i_diff = X_diff + d;
Dtype* f_diff = X_diff + 1 * hidden_dim_ + d;
Dtype* o_diff = X_diff + 2 * hidden_dim_ + d;
Dtype* g_diff = X_diff + 3 * hidden_dim_ + d;
const Dtype c_term_diff =
C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
*c_prev_diff = c_term_diff * f;
*i_diff = c_term_diff * g * i * (1 - i);
*f_diff = c_term_diff * c_prev * f * (1 - f);
*o_diff = H_diff[d] * tanh_c * o * (1 - o);
*g_diff = c_term_diff * i * (1 - g * g);
}
C_prev += hidden_dim_;
X += x_dim;
C += hidden_dim_;
H += hidden_dim_;
C_diff += hidden_dim_;
H_diff += hidden_dim_;
X_diff += x_dim;
C_prev_diff += hidden_dim_;
++cont;
}
}
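// The inner-loop gradients above follow from c = f * c_prev + i * g and
// h = o * \tanh(c) by the chain rule, mapped back to the pre-activation
// gate inputs via \sigmoid'(x) = \sigmoid(x) * (1 - \sigmoid(x)) and
// \tanh'(x) = 1 - \tanh(x)^2:
//   dL/dc      = C_diff + H_diff * o * (1 - \tanh(c)^2)   (c_term_diff)
//   dL/dc_prev = dL/dc * f
//   dL/di'     = dL/dc * g * i * (1 - i)
//   dL/df'     = dL/dc * c_prev * f * (1 - f)
//   dL/do'     = H_diff * \tanh(c) * o * (1 - o)
//   dL/dg'     = dL/dc * i * (1 - g * g)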
#ifdef CPU_ONLY
STUB_GPU(LSTMUnitLayer);
#endif
INSTANTIATE_CLASS(LSTMUnitLayer);
REGISTER_LAYER_CLASS(LSTMUnit);
} // namespace caffe

@@ -0,0 +1,154 @@
#include <algorithm>
#include <cmath>
#include <vector>
#include "caffe/layer.hpp"
#include "caffe/layers/lstm_layer.hpp"
namespace caffe {
template <typename Dtype>
__device__ Dtype sigmoid(const Dtype x) {
return Dtype(1) / (Dtype(1) + exp(-x));
}
template <typename Dtype>
__device__ Dtype tanh(const Dtype x) {
return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1);
}
template <typename Dtype>
__global__ void LSTMActsForward(const int nthreads, const int dim,
const Dtype* X, Dtype* X_acts) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int x_dim = 4 * dim;
const int d = index % x_dim;
if (d < 3 * dim) {
X_acts[index] = sigmoid(X[index]);
} else {
X_acts[index] = tanh(X[index]);
}
}
}
template <typename Dtype>
__global__ void LSTMUnitForward(const int nthreads, const int dim,
const Dtype* C_prev, const Dtype* X, const Dtype* cont,
Dtype* C, Dtype* H) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = cont[n] * f * c_prev + i * g;
C[index] = c;
const Dtype tanh_c = tanh(c);
H[index] = o * tanh_c;
}
}
template <typename Dtype>
void LSTMUnitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int count = top[1]->count();
const Dtype* C_prev = bottom[0]->gpu_data();
const Dtype* X = bottom[1]->gpu_data();
const Dtype* cont = bottom[2]->gpu_data();
Dtype* X_acts = X_acts_.mutable_gpu_data();
Dtype* C = top[0]->mutable_gpu_data();
Dtype* H = top[1]->mutable_gpu_data();
const int X_count = bottom[1]->count();
// NOLINT_NEXT_LINE(whitespace/operators)
LSTMActsForward<Dtype><<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>(
X_count, hidden_dim_, X, X_acts);
CUDA_POST_KERNEL_CHECK;
// NOLINT_NEXT_LINE(whitespace/operators)
LSTMUnitForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
count, hidden_dim_, C_prev, X_acts, cont, C, H);
CUDA_POST_KERNEL_CHECK;
}
template <typename Dtype>
__global__ void LSTMUnitBackward(const int nthreads, const int dim,
const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H,
const Dtype* cont, const Dtype* C_diff, const Dtype* H_diff,
Dtype* C_prev_diff, Dtype* X_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
const Dtype* X_offset = X + 4 * dim * n;
const Dtype i = X_offset[d];
const Dtype f = X_offset[1 * dim + d];
const Dtype o = X_offset[2 * dim + d];
const Dtype g = X_offset[3 * dim + d];
const Dtype c_prev = C_prev[index];
const Dtype c = C[index];
const Dtype tanh_c = tanh(c);
Dtype* c_prev_diff = C_prev_diff + index;
Dtype* X_diff_offset = X_diff + 4 * dim * n;
Dtype* i_diff = X_diff_offset + d;
Dtype* f_diff = X_diff_offset + 1 * dim + d;
Dtype* o_diff = X_diff_offset + 2 * dim + d;
Dtype* g_diff = X_diff_offset + 3 * dim + d;
const Dtype c_term_diff =
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
const Dtype cont_n = cont[n];
*c_prev_diff = cont_n * c_term_diff * f;
*i_diff = c_term_diff * g;
*f_diff = cont_n * c_term_diff * c_prev;
*o_diff = H_diff[index] * tanh_c;
*g_diff = c_term_diff * i;
}
}
template <typename Dtype>
__global__ void LSTMActsBackward(const int nthreads, const int dim,
const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int x_dim = 4 * dim;
const int d = index % x_dim;
const Dtype X_act = X_acts[index];
if (d < 3 * dim) {
X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act);
} else {
X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act);
}
}
}
template <typename Dtype>
void LSTMUnitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators.";
if (!propagate_down[0] && !propagate_down[1]) { return; }
const int count = top[1]->count();
const Dtype* C_prev = bottom[0]->gpu_data();
const Dtype* X_acts = X_acts_.gpu_data();
const Dtype* cont = bottom[2]->gpu_data();
const Dtype* C = top[0]->gpu_data();
const Dtype* H = top[1]->gpu_data();
const Dtype* C_diff = top[0]->gpu_diff();
const Dtype* H_diff = top[1]->gpu_diff();
Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff();
Dtype* X_acts_diff = X_acts_.mutable_gpu_diff();
LSTMUnitBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(count, hidden_dim_,
C_prev, X_acts, C, H, cont, C_diff, H_diff, C_prev_diff, X_acts_diff);
CUDA_POST_KERNEL_CHECK;
const int X_count = bottom[1]->count();
Dtype* X_diff = bottom[1]->mutable_gpu_diff();
LSTMActsBackward<Dtype> // NOLINT_NEXT_LINE(whitespace/operators)
<<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>(
X_count, hidden_dim_, X_acts, X_acts_diff, X_diff);
CUDA_POST_KERNEL_CHECK;
}
INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer);
} // namespace caffe
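Note the division of labor on the GPU: LSTMUnitForward consumes already-activated gate values from X_acts_, so LSTMUnitBackward produces gradients w.r.t. those activations, and LSTMActsBackward then folds in the activation derivatives using only the stored outputs a = X_acts[index]:

\[
\frac{\partial L}{\partial x} =
\begin{cases}
\dfrac{\partial L}{\partial a}\, a\,(1-a) & d < 3\cdot dim \quad \text{(sigmoid gates)} \\[1ex]
\dfrac{\partial L}{\partial a}\,(1-a^2) & \text{otherwise} \quad \text{(tanh gate)}
\end{cases}
\]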

@@ -19,16 +19,10 @@ void MemoryDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
CHECK_GT(batch_size_ * size_, 0) <<
"batch_size, channels, height, and width must be specified and"
" positive in memory_data_param";
int crop_size = this->transform_param_.crop_size();
if (crop_size > 0) {
top[0]->Reshape(batch_size_, channels_, crop_size, crop_size);
added_data_.Reshape(batch_size_, channels_, crop_size, crop_size);
} else {
top[0]->Reshape(batch_size_, channels_, height_, width_);
added_data_.Reshape(batch_size_, channels_, height_, width_);
}
vector<int> label_shape(1, batch_size_);
top[0]->Reshape(batch_size_, channels_, height_, width_);
top[1]->Reshape(label_shape);
added_data_.Reshape(batch_size_, channels_, height_, width_);
added_label_.Reshape(label_shape);
data_ = NULL;
labels_ = NULL;
@@ -44,11 +38,7 @@ void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
CHECK_GT(num, 0) << "There is no datum to add.";
CHECK_EQ(num % batch_size_, 0) <<
"The added data must be a multiple of the batch size.";
int crop_size = this->transform_param_.crop_size();
if (crop_size > 0)
added_data_.Reshape(num, channels_, crop_size, crop_size);
else
added_data_.Reshape(num, channels_, height_, width_);
added_data_.Reshape(num, channels_, height_, width_);
added_label_.Reshape(num, 1, 1, 1);
// Apply data transformations (mirror, scale, crop...)
this->data_transformer_->Transform(datum_vector, &added_data_);
@@ -73,11 +63,7 @@ void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
CHECK_GT(num, 0) << "There is no mat to add";
CHECK_EQ(num % batch_size_, 0) <<
"The added data must be a multiple of the batch size.";
int crop_size = this->transform_param_.crop_size();
if (crop_size > 0)
added_data_.Reshape(num, channels_, crop_size, crop_size);
else
added_data_.Reshape(num, channels_, height_, width_);
added_data_.Reshape(num, channels_, height_, width_);
added_label_.Reshape(num, 1, 1, 1);
// Apply data transformations (mirror, scale, crop...)
this->data_transformer_->Transform(mat_vector, &added_data_);
@@ -101,10 +87,7 @@ void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
// Warn with transformation parameters since a memory array is meant to
// be generic and no transformations are done with Reset().
if (this->layer_param_.has_transform_param()) {
// suppress this warning as we have applied transformation before calling
// Reset
// LOG(WARNING) << this->type() << " does not transform array data on
// Reset()";
LOG(WARNING) << this->type() << " does not transform array data on Reset()";
}
data_ = data;
labels_ = labels;
@@ -124,12 +107,8 @@ void MemoryDataLayer<Dtype>::set_batch_size(int new_size) {
template <typename Dtype>
void MemoryDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
int crop_size = this->transform_param_.crop_size();
if (crop_size > 0)
top[0]->Reshape(batch_size_, channels_, crop_size, crop_size);
else
top[0]->Reshape(batch_size_, channels_, height_, width_);
CHECK(data_) << "MemoryDataLayer needs to be initialized by calling Reset";
top[0]->Reshape(batch_size_, channels_, height_, width_);
top[1]->Reshape(batch_size_, 1, 1, 1);
top[0]->set_cpu_data(data_ + pos_ * size_);
top[1]->set_cpu_data(labels_ + pos_);
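For context, Reset() merely stores the caller's pointers, and Forward_cpu then walks them batch_size_ items at a time with the pointer arithmetic above. A usage sketch (the layer and blob variables below are illustrative, not from this change):

// Sketch: feed two preloaded 3x4x5 items to a MemoryDataLayer configured
// with batch_size 2; md_layer, bottom_vec, and top_vec are hypothetical.
std::vector<float> data(2 * 3 * 4 * 5, 0.5f);   // caller keeps ownership
std::vector<float> labels(2, 1.0f);             // one label per item
md_layer.Reset(data.data(), labels.data(), 2);  // n must be a multiple of batch_size
md_layer.Forward(bottom_vec, top_vec);          // tops now alias the caller's arrays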

@@ -0,0 +1,295 @@
#include <string>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void RecurrentLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK_GE(bottom[0]->num_axes(), 2)
<< "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)";
T_ = bottom[0]->shape(0);
N_ = bottom[0]->shape(1);
LOG(INFO) << "Initializing recurrent layer: assuming input batch contains "
<< T_ << " timesteps of " << N_ << " independent streams.";
CHECK_EQ(bottom[1]->num_axes(), 2)
<< "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)";
CHECK_EQ(T_, bottom[1]->shape(0));
CHECK_EQ(N_, bottom[1]->shape(1));
// If expose_hidden is set, we take as input and produce as output
// the hidden state blobs at the first and last timesteps.
expose_hidden_ = this->layer_param_.recurrent_param().expose_hidden();
// Get (recurrent) input/output names.
vector<string> output_names;
OutputBlobNames(&output_names);
vector<string> recur_input_names;
RecurrentInputBlobNames(&recur_input_names);
vector<string> recur_output_names;
RecurrentOutputBlobNames(&recur_output_names);
const int num_recur_blobs = recur_input_names.size();
CHECK_EQ(num_recur_blobs, recur_output_names.size());
// If provided, bottom[2] is a static input to the recurrent net.
const int num_hidden_exposed = expose_hidden_ * num_recur_blobs;
static_input_ = (bottom.size() > 2 + num_hidden_exposed);
if (static_input_) {
CHECK_GE(bottom[2]->num_axes(), 1);
CHECK_EQ(N_, bottom[2]->shape(0));
}
// Create a NetParameter; setup the inputs that aren't unique to particular
// recurrent architectures.
NetParameter net_param;
LayerParameter* input_layer_param = net_param.add_layer();
input_layer_param->set_type("Input");
InputParameter* input_param = input_layer_param->mutable_input_param();
input_layer_param->add_top("x");
BlobShape input_shape;
for (int i = 0; i < bottom[0]->num_axes(); ++i) {
input_shape.add_dim(bottom[0]->shape(i));
}
input_param->add_shape()->CopyFrom(input_shape);
input_shape.Clear();
for (int i = 0; i < bottom[1]->num_axes(); ++i) {
input_shape.add_dim(bottom[1]->shape(i));
}
input_layer_param->add_top("cont");
input_param->add_shape()->CopyFrom(input_shape);
if (static_input_) {
input_shape.Clear();
for (int i = 0; i < bottom[2]->num_axes(); ++i) {
input_shape.add_dim(bottom[2]->shape(i));
}
input_layer_param->add_top("x_static");
input_param->add_shape()->CopyFrom(input_shape);
}
// Call the child's FillUnrolledNet implementation to specify the unrolled
// recurrent architecture.
this->FillUnrolledNet(&net_param);
// Prepend this layer's name to the names of each layer in the unrolled net.
const string& layer_name = this->layer_param_.name();
if (layer_name.size()) {
for (int i = 0; i < net_param.layer_size(); ++i) {
LayerParameter* layer = net_param.mutable_layer(i);
layer->set_name(layer_name + "_" + layer->name());
}
}
// Add "pseudo-losses" to all outputs to force backpropagation.
// (Setting force_backward is too aggressive as we may not need to backprop to
// all inputs, e.g., the sequence continuation indicators.)
vector<string> pseudo_losses(output_names.size());
for (int i = 0; i < output_names.size(); ++i) {
LayerParameter* layer = net_param.add_layer();
pseudo_losses[i] = output_names[i] + "_pseudoloss";
layer->set_name(pseudo_losses[i]);
layer->set_type("Reduction");
layer->add_bottom(output_names[i]);
layer->add_top(pseudo_losses[i]);
layer->add_loss_weight(1);
}
// Create the unrolled net.
unrolled_net_.reset(new Net<Dtype>(net_param));
unrolled_net_->set_debug_info(
this->layer_param_.recurrent_param().debug_info());
// Setup pointers to the inputs.
x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get());
cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get());
if (static_input_) {
x_static_input_blob_ =
CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get());
}
// Setup pointers to paired recurrent inputs/outputs.
recur_input_blobs_.resize(num_recur_blobs);
recur_output_blobs_.resize(num_recur_blobs);
for (int i = 0; i < recur_input_names.size(); ++i) {
recur_input_blobs_[i] =
CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get());
recur_output_blobs_[i] =
CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get());
}
// Setup pointers to outputs.
CHECK_EQ(top.size() - num_hidden_exposed, output_names.size())
<< "OutputBlobNames must provide an output blob name for each top.";
output_blobs_.resize(output_names.size());
for (int i = 0; i < output_names.size(); ++i) {
output_blobs_[i] =
CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get());
}
// We should have 2 inputs (x and cont), plus a number of recurrent inputs,
// plus maybe a static input.
CHECK_EQ(2 + num_recur_blobs + static_input_,
unrolled_net_->input_blobs().size());
// This layer's parameters are any parameters in the layers of the unrolled
// net. We only want one copy of each parameter, so check that the parameter
// is "owned" by the layer, rather than shared with another.
this->blobs_.clear();
for (int i = 0; i < unrolled_net_->params().size(); ++i) {
if (unrolled_net_->param_owners()[i] == -1) {
LOG(INFO) << "Adding parameter " << i << ": "
<< unrolled_net_->param_display_names()[i];
this->blobs_.push_back(unrolled_net_->params()[i]);
}
}
// Check that param_propagate_down is set for all of the parameters in the
// unrolled net; set param_propagate_down to true in this layer.
for (int i = 0; i < unrolled_net_->layers().size(); ++i) {
for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) {
CHECK(unrolled_net_->layers()[i]->param_propagate_down(j))
<< "param_propagate_down not set for layer " << i << ", param " << j;
}
}
this->param_propagate_down_.clear();
this->param_propagate_down_.resize(this->blobs_.size(), true);
// Set the diffs of recurrent outputs to 0 -- we can't backpropagate across
// batches.
for (int i = 0; i < recur_output_blobs_.size(); ++i) {
caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
recur_output_blobs_[i]->mutable_cpu_diff());
}
// Check that the last output_names.size() layers are the pseudo-losses;
// set last_layer_index so that we don't actually run these layers.
const vector<string>& layer_names = unrolled_net_->layer_names();
last_layer_index_ = layer_names.size() - 1 - pseudo_losses.size();
for (int i = last_layer_index_ + 1, j = 0; i < layer_names.size(); ++i, ++j) {
CHECK_EQ(layer_names[i], pseudo_losses[j]);
}
}
template <typename Dtype>
void RecurrentLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK_GE(bottom[0]->num_axes(), 2)
<< "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)";
CHECK_EQ(T_, bottom[0]->shape(0)) << "input number of timesteps changed";
N_ = bottom[0]->shape(1);
CHECK_EQ(bottom[1]->num_axes(), 2)
<< "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)";
CHECK_EQ(T_, bottom[1]->shape(0));
CHECK_EQ(N_, bottom[1]->shape(1));
x_input_blob_->ReshapeLike(*bottom[0]);
vector<int> cont_shape = bottom[1]->shape();
cont_input_blob_->Reshape(cont_shape);
if (static_input_) {
x_static_input_blob_->ReshapeLike(*bottom[2]);
}
vector<BlobShape> recur_input_shapes;
RecurrentInputShapes(&recur_input_shapes);
CHECK_EQ(recur_input_shapes.size(), recur_input_blobs_.size());
for (int i = 0; i < recur_input_shapes.size(); ++i) {
recur_input_blobs_[i]->Reshape(recur_input_shapes[i]);
}
unrolled_net_->Reshape();
x_input_blob_->ShareData(*bottom[0]);
x_input_blob_->ShareDiff(*bottom[0]);
cont_input_blob_->ShareData(*bottom[1]);
if (static_input_) {
x_static_input_blob_->ShareData(*bottom[2]);
x_static_input_blob_->ShareDiff(*bottom[2]);
}
if (expose_hidden_) {
const int bottom_offset = 2 + static_input_;
for (int i = bottom_offset, j = 0; i < bottom.size(); ++i, ++j) {
CHECK(recur_input_blobs_[j]->shape() == bottom[i]->shape())
<< "bottom[" << i << "] shape must match hidden state input shape: "
<< recur_input_blobs_[j]->shape_string();
recur_input_blobs_[j]->ShareData(*bottom[i]);
}
}
for (int i = 0; i < output_blobs_.size(); ++i) {
top[i]->ReshapeLike(*output_blobs_[i]);
top[i]->ShareData(*output_blobs_[i]);
top[i]->ShareDiff(*output_blobs_[i]);
}
if (expose_hidden_) {
const int top_offset = output_blobs_.size();
for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) {
top[i]->ReshapeLike(*recur_output_blobs_[j]);
}
}
}
template <typename Dtype>
void RecurrentLayer<Dtype>::Reset() {
// "Reset" the hidden state of the net by zeroing out all recurrent outputs.
for (int i = 0; i < recur_output_blobs_.size(); ++i) {
caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
recur_output_blobs_[i]->mutable_cpu_data());
}
}
template <typename Dtype>
void RecurrentLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// Hacky fix for test time: reshare all the internal shared blobs, which may
// currently point to a stale owner blob that was dropped when Solver::Test
// called test_net->ShareTrainedLayersWith(net_.get()).
// TODO: somehow make this work non-hackily.
if (this->phase_ == TEST) {
unrolled_net_->ShareWeights();
}
DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
if (!expose_hidden_) {
for (int i = 0; i < recur_input_blobs_.size(); ++i) {
const int count = recur_input_blobs_[i]->count();
DCHECK_EQ(count, recur_output_blobs_[i]->count());
const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data();
Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data();
caffe_copy(count, timestep_T_data, timestep_0_data);
}
}
unrolled_net_->ForwardTo(last_layer_index_);
if (expose_hidden_) {
const int top_offset = output_blobs_.size();
for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) {
top[i]->ShareData(*recur_output_blobs_[j]);
}
}
}
template <typename Dtype>
void RecurrentLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators.";
// TODO: skip backpropagation to inputs and parameters inside the unrolled
// net according to propagate_down[0] and propagate_down[2]. For now just
// backprop to inputs and parameters unconditionally, as either the inputs or
// the parameters do need backward (or Net would have set
// layer_needs_backward_[i] == false for this layer).
unrolled_net_->BackwardFrom(last_layer_index_);
}
#ifdef CPU_ONLY
STUB_GPU_FORWARD(RecurrentLayer, Forward);
#endif
INSTANTIATE_CLASS(RecurrentLayer);
} // namespace caffe

@@ -0,0 +1,44 @@
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void RecurrentLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// Hacky fix for test time... reshare all the shared blobs.
// TODO: somehow make this work non-hackily.
if (this->phase_ == TEST) {
unrolled_net_->ShareWeights();
}
DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
if (!expose_hidden_) {
for (int i = 0; i < recur_input_blobs_.size(); ++i) {
const int count = recur_input_blobs_[i]->count();
DCHECK_EQ(count, recur_output_blobs_[i]->count());
const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data();
Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data();
caffe_copy(count, timestep_T_data, timestep_0_data);
}
}
unrolled_net_->ForwardTo(last_layer_index_);
if (expose_hidden_) {
const int top_offset = output_blobs_.size();
for (int i = top_offset, j = 0; i < top.size(); ++i, ++j) {
top[i]->ShareData(*recur_output_blobs_[j]);
}
}
}
INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer);
} // namespace caffe

@@ -0,0 +1,236 @@
#include <string>
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/rnn_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void RNNLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
names->resize(1);
(*names)[0] = "h_0";
}
template <typename Dtype>
void RNNLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
names->resize(1);
(*names)[0] = "h_" + format_int(this->T_);
}
template <typename Dtype>
void RNNLayer<Dtype>::RecurrentInputShapes(vector<BlobShape>* shapes) const {
const int num_output = this->layer_param_.recurrent_param().num_output();
shapes->resize(1);
(*shapes)[0].Clear();
(*shapes)[0].add_dim(1); // a single timestep
(*shapes)[0].add_dim(this->N_);
(*shapes)[0].add_dim(num_output);
}
template <typename Dtype>
void RNNLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
names->resize(1);
(*names)[0] = "o";
}
template <typename Dtype>
void RNNLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
const int num_output = this->layer_param_.recurrent_param().num_output();
CHECK_GT(num_output, 0) << "num_output must be positive";
const FillerParameter& weight_filler =
this->layer_param_.recurrent_param().weight_filler();
const FillerParameter& bias_filler =
this->layer_param_.recurrent_param().bias_filler();
// Add generic LayerParameters (without bottoms/tops) for the layer types
// we'll use repeatedly below, to avoid redundant code.
LayerParameter hidden_param;
hidden_param.set_type("InnerProduct");
hidden_param.mutable_inner_product_param()->set_num_output(num_output);
hidden_param.mutable_inner_product_param()->set_bias_term(false);
hidden_param.mutable_inner_product_param()->set_axis(2);
hidden_param.mutable_inner_product_param()->
mutable_weight_filler()->CopyFrom(weight_filler);
LayerParameter biased_hidden_param(hidden_param);
biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
biased_hidden_param.mutable_inner_product_param()->
mutable_bias_filler()->CopyFrom(bias_filler);
LayerParameter sum_param;
sum_param.set_type("Eltwise");
sum_param.mutable_eltwise_param()->set_operation(
EltwiseParameter_EltwiseOp_SUM);
LayerParameter tanh_param;
tanh_param.set_type("TanH");
LayerParameter scale_param;
scale_param.set_type("Scale");
scale_param.mutable_scale_param()->set_axis(0);
LayerParameter slice_param;
slice_param.set_type("Slice");
slice_param.mutable_slice_param()->set_axis(0);
vector<BlobShape> input_shapes;
RecurrentInputShapes(&input_shapes);
CHECK_EQ(1, input_shapes.size());
LayerParameter* input_layer_param = net_param->add_layer();
input_layer_param->set_type("Input");
InputParameter* input_param = input_layer_param->mutable_input_param();
input_layer_param->add_top("h_0");
input_param->add_shape()->CopyFrom(input_shapes[0]);
LayerParameter* cont_slice_param = net_param->add_layer();
cont_slice_param->CopyFrom(slice_param);
cont_slice_param->set_name("cont_slice");
cont_slice_param->add_bottom("cont");
cont_slice_param->mutable_slice_param()->set_axis(0);
// Add layer to transform all timesteps of x to the hidden state dimension.
// W_xh_x = W_xh * x + b_h
{
LayerParameter* x_transform_param = net_param->add_layer();
x_transform_param->CopyFrom(biased_hidden_param);
x_transform_param->set_name("x_transform");
x_transform_param->add_param()->set_name("W_xh");
x_transform_param->add_param()->set_name("b_h");
x_transform_param->add_bottom("x");
x_transform_param->add_top("W_xh_x");
x_transform_param->add_propagate_down(true);
}
if (this->static_input_) {
// Add layer to transform x_static to the hidden state dimension.
// W_xh_x_static = W_xh_static * x_static
LayerParameter* x_static_transform_param = net_param->add_layer();
x_static_transform_param->CopyFrom(hidden_param);
x_static_transform_param->mutable_inner_product_param()->set_axis(1);
x_static_transform_param->set_name("W_xh_x_static");
x_static_transform_param->add_param()->set_name("W_xh_static");
x_static_transform_param->add_bottom("x_static");
x_static_transform_param->add_top("W_xh_x_static_preshape");
x_static_transform_param->add_propagate_down(true);
LayerParameter* reshape_param = net_param->add_layer();
reshape_param->set_type("Reshape");
BlobShape* new_shape =
reshape_param->mutable_reshape_param()->mutable_shape();
new_shape->add_dim(1); // One timestep.
// Should infer this->N as the dimension so we can reshape on batch size.
new_shape->add_dim(-1);
new_shape->add_dim(
x_static_transform_param->inner_product_param().num_output());
reshape_param->set_name("W_xh_x_static_reshape");
reshape_param->add_bottom("W_xh_x_static_preshape");
reshape_param->add_top("W_xh_x_static");
}
LayerParameter* x_slice_param = net_param->add_layer();
x_slice_param->CopyFrom(slice_param);
x_slice_param->set_name("W_xh_x_slice");
x_slice_param->add_bottom("W_xh_x");
LayerParameter output_concat_layer;
output_concat_layer.set_name("o_concat");
output_concat_layer.set_type("Concat");
output_concat_layer.add_top("o");
output_concat_layer.mutable_concat_param()->set_axis(0);
for (int t = 1; t <= this->T_; ++t) {
string tm1s = format_int(t - 1);
string ts = format_int(t);
cont_slice_param->add_top("cont_" + ts);
x_slice_param->add_top("W_xh_x_" + ts);
// Add layer to flush the hidden state when beginning a new sequence,
// as indicated by cont_t.
// h_conted_{t-1} := cont_t * h_{t-1}
//
// Normally, cont_t is binary (i.e., 0 or 1), so:
// h_conted_{t-1} := h_{t-1} if cont_t == 1
// 0 otherwise
{
LayerParameter* cont_h_param = net_param->add_layer();
cont_h_param->CopyFrom(scale_param);
cont_h_param->set_name("h_conted_" + tm1s);
cont_h_param->add_bottom("h_" + tm1s);
cont_h_param->add_bottom("cont_" + ts);
cont_h_param->add_top("h_conted_" + tm1s);
}
// Add layer to compute
// W_hh_h_{t-1} := W_hh * h_conted_{t-1}
{
LayerParameter* w_param = net_param->add_layer();
w_param->CopyFrom(hidden_param);
w_param->set_name("W_hh_h_" + tm1s);
w_param->add_param()->set_name("W_hh");
w_param->add_bottom("h_conted_" + tm1s);
w_param->add_top("W_hh_h_" + tm1s);
w_param->mutable_inner_product_param()->set_axis(2);
}
// Add layers to compute
// h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h )
// = \tanh( W_hh_h_{t-1} + W_xh_t )
{
LayerParameter* h_input_sum_param = net_param->add_layer();
h_input_sum_param->CopyFrom(sum_param);
h_input_sum_param->set_name("h_input_sum_" + ts);
h_input_sum_param->add_bottom("W_hh_h_" + tm1s);
h_input_sum_param->add_bottom("W_xh_x_" + ts);
if (this->static_input_) {
h_input_sum_param->add_bottom("W_xh_x_static");
}
h_input_sum_param->add_top("h_neuron_input_" + ts);
}
{
LayerParameter* h_neuron_param = net_param->add_layer();
h_neuron_param->CopyFrom(tanh_param);
h_neuron_param->set_name("h_neuron_" + ts);
h_neuron_param->add_bottom("h_neuron_input_" + ts);
h_neuron_param->add_top("h_" + ts);
}
// Add layer to compute
// W_ho_h_t := W_ho * h_t + b_o
{
LayerParameter* w_param = net_param->add_layer();
w_param->CopyFrom(biased_hidden_param);
w_param->set_name("W_ho_h_" + ts);
w_param->add_param()->set_name("W_ho");
w_param->add_param()->set_name("b_o");
w_param->add_bottom("h_" + ts);
w_param->add_top("W_ho_h_" + ts);
w_param->mutable_inner_product_param()->set_axis(2);
}
// Add layers to compute
// o_t := \tanh( W_ho h_t + b_o)
// = \tanh( W_ho_h_t )
{
LayerParameter* o_neuron_param = net_param->add_layer();
o_neuron_param->CopyFrom(tanh_param);
o_neuron_param->set_name("o_neuron_" + ts);
o_neuron_param->add_bottom("W_ho_h_" + ts);
o_neuron_param->add_top("o_" + ts);
}
output_concat_layer.add_bottom("o_" + ts);
} // for (int t = 1; t <= this->T_; ++t)
net_param->add_layer()->CopyFrom(output_concat_layer);
}
INSTANTIATE_CLASS(RNNLayer);
REGISTER_LAYER_CLASS(RNN);
} // namespace caffe
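In summary, the unrolled net computes the vanilla RNN recurrence per timestep, with the bracketed static term present only when x_static is provided:

\[
h_t = \tanh\!\big(W_{hh}(\mathrm{cont}_t \odot h_{t-1}) + W_{xh}\,x_t + b_h \;[+\; W_{xh\_static}\,x_{static}]\big), \qquad
o_t = \tanh\!\big(W_{ho}\,h_t + b_o\big).
\]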

@@ -265,6 +265,9 @@ void WindowDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
const int num_samples[2] = { batch_size - num_fg, num_fg };
int item_id = 0;
CHECK_GT(fg_windows_.size(), 0);
CHECK_GT(bg_windows_.size(), 0);
// sample from bg set then fg set
for (int is_fg = 0; is_fg < 2; ++is_fg) {
for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) {

@@ -28,11 +28,20 @@ Net<Dtype>::Net(const NetParameter& param, const Net* root_net)
}
template <typename Dtype>
Net<Dtype>::Net(const string& param_file, Phase phase, const Net* root_net)
Net<Dtype>::Net(const string& param_file, Phase phase,
const int level, const vector<string>* stages,
const Net* root_net)
: root_net_(root_net) {
NetParameter param;
ReadNetParamsFromTextFileOrDie(param_file, &param);
// Set phase, stages and level
param.mutable_state()->set_phase(phase);
if (stages != NULL) {
for (int i = 0; i < stages->size(); i++) {
param.mutable_state()->add_stage((*stages)[i]);
}
}
param.mutable_state()->set_level(level);
Init(param);
}
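The extended constructor lets callers pick a NetState directly when loading a prototxt. A hypothetical call (the prototxt path and stage name are made up; this assumes the trailing root_net argument keeps a NULL default):

#include <string>
#include <vector>
#include "caffe/net.hpp"

// Sketch: load a net in TEST phase at level 0 with one active stage.
std::vector<std::string> stages;
stages.push_back("deploy");  // hypothetical stage name
caffe::Net<float> net("model.prototxt", caffe::TEST, 0, &stages);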

@@ -306,7 +306,7 @@ message ParamSpec {
// NOTE
// Update the next available ID when you add a new LayerParameter field.
//
// LayerParameter next available layer-specific ID: 146 (last added: parameter_param)
// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param)
message LayerParameter {
optional string name = 1; // the layer name
optional string type = 2; // the layer type
@@ -390,6 +390,7 @@ message LayerParameter {
optional PowerParameter power_param = 122;
optional PReLUParameter prelu_param = 131;
optional PythonParameter python_param = 130;
optional RecurrentParameter recurrent_param = 146;
optional ReductionParameter reduction_param = 136;
optional ReLUParameter relu_param = 123;
optional ReshapeParameter reshape_param = 133;
@@ -930,6 +931,25 @@ message PythonParameter {
optional bool share_in_parallel = 4 [default = false];
}
// Message that stores parameters used by RecurrentLayer
message RecurrentParameter {
// The dimension of the output (and usually hidden state) representation --
// must be explicitly set to non-zero.
optional uint32 num_output = 1 [default = 0];
optional FillerParameter weight_filler = 2; // The filler for the weight
optional FillerParameter bias_filler = 3; // The filler for the bias
// Whether to enable displaying debug_info in the unrolled recurrent net.
optional bool debug_info = 4 [default = false];
// Whether to add as additional inputs (bottoms) the initial hidden state
// blobs, and add as additional outputs (tops) the final timestep hidden state
// blobs. The number of additional bottom/top blobs required depends on the
// recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs.
optional bool expose_hidden = 5 [default = false];
}
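From C++, these fields are set through the generated protobuf API, using the same setters the new LSTMLayerTest exercises further down; a brief sketch:

// Sketch: configure a recurrent layer via RecurrentParameter.
caffe::LayerParameter param;
param.set_type("LSTM");  // or "RNN"
caffe::RecurrentParameter* recur = param.mutable_recurrent_param();
recur->set_num_output(7);                              // must be non-zero
recur->mutable_weight_filler()->set_type("gaussian");  // optional filler
recur->set_expose_hidden(true);  // adds initial-state bottoms and final-state tops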
// Message that stores parameters used by ReductionLayer
message ReductionParameter {
enum ReductionOp {

@@ -1,6 +1,3 @@
// The main caffe test code. Your test cpp code should include this hpp
// to allow a main function to be compiled into the binary.
#include "caffe/caffe.hpp"
#include "caffe/test/test_caffe_main.hpp"

@@ -124,7 +124,7 @@ TYPED_TEST(EmbedLayerTest, TestForwardWithBias) {
top_offset[4] = 0;
bias_offset[0] = 0;
for (int j = 0; j < kNumOutput; ++j) {
EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) +
EXPECT_FLOAT_EQ(layer->blobs()[0]->data_at(weight_offset) +
layer->blobs()[1]->data_at(bias_offset),
this->blob_top_->data_at(top_offset));
++top_offset[4];

@@ -34,16 +34,24 @@ class ImageDataLayerTest : public MultiDeviceTest<TypeParam> {
std::ofstream outfile(filename_.c_str(), std::ofstream::out);
LOG(INFO) << "Using temporary file " << filename_;
for (int i = 0; i < 5; ++i) {
outfile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << i;
outfile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << i << std::endl;
}
outfile.close();
// Create test input file for images of distinct sizes.
MakeTempFilename(&filename_reshape_);
std::ofstream reshapefile(filename_reshape_.c_str(), std::ofstream::out);
LOG(INFO) << "Using temporary file " << filename_reshape_;
reshapefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0;
reshapefile << EXAMPLES_SOURCE_DIR "images/fish-bike.jpg " << 1;
reshapefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl;
reshapefile << EXAMPLES_SOURCE_DIR "images/fish-bike.jpg " << 1
<< std::endl;
reshapefile.close();
// Create a test input file listing images with spaces in their names
MakeTempFilename(&filename_space_);
std::ofstream spacefile(filename_space_.c_str(), std::ofstream::out);
LOG(INFO) << "Using temporary file " << filename_space_;
spacefile << EXAMPLES_SOURCE_DIR "images/cat.jpg " << 0 << std::endl;
spacefile << EXAMPLES_SOURCE_DIR "images/cat gray.jpg " << 1 << std::endl;
spacefile.close();
}
virtual ~ImageDataLayerTest() {
@@ -54,6 +62,7 @@ class ImageDataLayerTest : public MultiDeviceTest<TypeParam> {
int seed_;
string filename_;
string filename_reshape_;
string filename_space_;
Blob<Dtype>* const blob_top_data_;
Blob<Dtype>* const blob_top_label_;
vector<Blob<Dtype>*> blob_bottom_vec_;
@@ -177,5 +186,34 @@ TYPED_TEST(ImageDataLayerTest, TestShuffle) {
}
}
TYPED_TEST(ImageDataLayerTest, TestSpace) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter param;
ImageDataParameter* image_data_param = param.mutable_image_data_param();
image_data_param->set_batch_size(1);
image_data_param->set_source(this->filename_space_.c_str());
image_data_param->set_shuffle(false);
ImageDataLayer<Dtype> layer(param);
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(this->blob_top_label_->num(), 1);
EXPECT_EQ(this->blob_top_label_->channels(), 1);
EXPECT_EQ(this->blob_top_label_->height(), 1);
EXPECT_EQ(this->blob_top_label_->width(), 1);
// cat.jpg
layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(this->blob_top_data_->num(), 1);
EXPECT_EQ(this->blob_top_data_->channels(), 3);
EXPECT_EQ(this->blob_top_data_->height(), 360);
EXPECT_EQ(this->blob_top_data_->width(), 480);
EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 0);
// cat gray.jpg
layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(this->blob_top_data_->num(), 1);
EXPECT_EQ(this->blob_top_data_->channels(), 3);
EXPECT_EQ(this->blob_top_data_->height(), 360);
EXPECT_EQ(this->blob_top_data_->width(), 480);
EXPECT_EQ(this->blob_top_label_->cpu_data()[0], 1);
}
} // namespace caffe
#endif // USE_OPENCV

@@ -0,0 +1,288 @@
#include <cstring>
#include <vector>
#include "gtest/gtest.h"
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layers/lstm_layer.hpp"
#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"
namespace caffe {
template <typename TypeParam>
class LSTMLayerTest : public MultiDeviceTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
protected:
LSTMLayerTest() : num_output_(7) {
blob_bottom_vec_.push_back(&blob_bottom_);
blob_bottom_vec_.push_back(&blob_bottom_cont_);
blob_top_vec_.push_back(&blob_top_);
unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_);
unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_);
unit_blob_bottom_vec_.push_back(&unit_blob_bottom_cont_);
unit_blob_top_vec_.push_back(&unit_blob_top_c_);
unit_blob_top_vec_.push_back(&unit_blob_top_h_);
ReshapeBlobs(1, 3);
layer_param_.mutable_recurrent_param()->set_num_output(num_output_);
FillerParameter* weight_filler =
layer_param_.mutable_recurrent_param()->mutable_weight_filler();
weight_filler->set_type("gaussian");
weight_filler->set_std(0.2);
FillerParameter* bias_filler =
layer_param_.mutable_recurrent_param()->mutable_bias_filler();
bias_filler->set_type("gaussian");
bias_filler->set_std(0.1);
layer_param_.set_phase(TEST);
}
void ReshapeBlobs(int num_timesteps, int num_instances) {
blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2);
blob_bottom_static_.Reshape(num_instances, 2, 3, 4);
vector<int> shape(2);
shape[0] = num_timesteps;
shape[1] = num_instances;
blob_bottom_cont_.Reshape(shape);
shape.push_back(num_output_);
shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_;
unit_blob_bottom_x_.Reshape(shape);
shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_;
unit_blob_bottom_c_prev_.Reshape(shape);
shape.resize(2);
shape[0] = 1; shape[1] = num_instances;
unit_blob_bottom_cont_.Reshape(shape);
FillerParameter filler_param;
filler_param.set_min(-1);
filler_param.set_max(1);
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&blob_bottom_);
filler.Fill(&unit_blob_bottom_c_prev_);
filler.Fill(&unit_blob_bottom_x_);
}
int num_output_;
LayerParameter layer_param_;
Blob<Dtype> blob_bottom_;
Blob<Dtype> blob_bottom_cont_;
Blob<Dtype> blob_bottom_static_;
Blob<Dtype> blob_top_;
vector<Blob<Dtype>*> blob_bottom_vec_;
vector<Blob<Dtype>*> blob_top_vec_;
Blob<Dtype> unit_blob_bottom_cont_;
Blob<Dtype> unit_blob_bottom_c_prev_;
Blob<Dtype> unit_blob_bottom_x_;
Blob<Dtype> unit_blob_top_c_;
Blob<Dtype> unit_blob_top_h_;
vector<Blob<Dtype>*> unit_blob_bottom_vec_;
vector<Blob<Dtype>*> unit_blob_top_vec_;
};
TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices);
TYPED_TEST(LSTMLayerTest, TestSetUp) {
typedef typename TypeParam::Dtype Dtype;
LSTMLayer<Dtype> layer(this->layer_param_);
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
vector<int> expected_top_shape = this->blob_bottom_.shape();
expected_top_shape.resize(3);
expected_top_shape[2] = this->num_output_;
EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape);
}
TYPED_TEST(LSTMLayerTest, TestForward) {
typedef typename TypeParam::Dtype Dtype;
const int kNumTimesteps = 3;
const int num = this->blob_bottom_.shape(1);
this->ReshapeBlobs(kNumTimesteps, num);
// Fill the cont blob with <0, 1, 1, ..., 1>,
// indicating a sequence that begins at the first timestep
// then continues for the rest of the sequence.
for (int t = 0; t < kNumTimesteps; ++t) {
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0;
}
}
// Process the full sequence in a single batch.
FillerParameter filler_param;
filler_param.set_mean(0);
filler_param.set_std(1);
GaussianFiller<Dtype> sequence_filler(filler_param);
Caffe::set_random_seed(1);
sequence_filler.Fill(&this->blob_bottom_);
shared_ptr<LSTMLayer<Dtype> > layer(new LSTMLayer<Dtype>(this->layer_param_));
Caffe::set_random_seed(1701);
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
LOG(INFO) << "Calling forward for full sequence LSTM";
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
// Copy the inputs and outputs to reuse/check them later.
Blob<Dtype> bottom_copy(this->blob_bottom_.shape());
bottom_copy.CopyFrom(this->blob_bottom_);
Blob<Dtype> top_copy(this->blob_top_.shape());
top_copy.CopyFrom(this->blob_top_);
// Process the batch one timestep at a time;
// check that we get the same result.
this->ReshapeBlobs(1, num);
layer.reset(new LSTMLayer<Dtype>(this->layer_param_));
Caffe::set_random_seed(1701);
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
const int bottom_count = this->blob_bottom_.count();
const int top_count = this->blob_top_.count();
const Dtype kEpsilon = 1e-5;
for (int t = 0; t < kNumTimesteps; ++t) {
caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
this->blob_bottom_.mutable_cpu_data());
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0;
}
LOG(INFO) << "Calling forward for LSTM timestep " << t;
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
for (int i = 0; i < top_count; ++i) {
ASSERT_LT(t * top_count + i, top_copy.count());
EXPECT_NEAR(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i], kEpsilon)
<< "t = " << t << "; i = " << i;
}
}
// Process the batch one timestep at a time with all cont blobs set to 0.
// Check that we get a different result, except in the first timestep.
Caffe::set_random_seed(1701);
layer.reset(new LSTMLayer<Dtype>(this->layer_param_));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
for (int t = 0; t < kNumTimesteps; ++t) {
caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
this->blob_bottom_.mutable_cpu_data());
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[n] = 0;
}
LOG(INFO) << "Calling forward for LSTM timestep " << t;
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
for (int i = 0; i < top_count; ++i) {
if (t == 0) {
EXPECT_NEAR(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i], kEpsilon)
<< "t = " << t << "; i = " << i;
} else {
EXPECT_NE(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i])
<< "t = " << t << "; i = " << i;
}
}
}
}
TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
LSTMUnitLayer<Dtype> layer(layer_param);
layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_);
const int num_axes = this->unit_blob_bottom_c_prev_.num_axes();
ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes());
ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes());
for (int i = 0; i < num_axes; ++i) {
EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i),
this->unit_blob_top_c_.shape(i));
EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i),
this->unit_blob_top_h_.shape(i));
}
}
TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
LSTMUnitLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-3);
// The unit layer reads cont from unit_blob_bottom_cont_ (bottom index 2),
// so that is the blob to fill for the unit gradient checks.
Dtype* cont_data = this->unit_blob_bottom_cont_.mutable_cpu_data();
cont_data[0] = 0;
cont_data[1] = 0;
cont_data[2] = 0;
checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
this->unit_blob_top_vec_, 0);
checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
this->unit_blob_top_vec_, 1);
}
TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroCont) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
LSTMUnitLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-3);
Dtype* cont_data = this->unit_blob_bottom_cont_.mutable_cpu_data();
cont_data[0] = 1;
cont_data[1] = 0;
cont_data[2] = 1;
checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
this->unit_blob_top_vec_, 0);
checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
this->unit_blob_top_vec_, 1);
}
TYPED_TEST(LSTMLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
LSTMLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(LSTMLayerTest, TestGradientNonZeroCont) {
typedef typename TypeParam::Dtype Dtype;
LSTMLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2) {
typedef typename TypeParam::Dtype Dtype;
this->ReshapeBlobs(2, 2);
FillerParameter filler_param;
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&this->blob_bottom_);
LSTMLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(LSTMLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) {
typedef typename TypeParam::Dtype Dtype;
this->ReshapeBlobs(2, 2);
FillerParameter filler_param;
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&this->blob_bottom_);
filler.Fill(&this->blob_bottom_static_);
this->blob_bottom_vec_.push_back(&this->blob_bottom_static_);
LSTMLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 2);
}
} // namespace caffe
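A sketch of the cont ("sequence continuation") semantics these tests exercise, assuming the standard Caffe LSTMUnit formulation in which x_t stacks the four gate pre-activations:

i_t = \sigma(x_{t,i}), \quad f_t = \sigma(x_{t,f}), \quad o_t = \sigma(x_{t,o}), \quad g_t = \tanh(x_{t,g})
c_t = \mathrm{cont}_t \cdot (f_t \odot c_{t-1}) + i_t \odot g_t, \qquad h_t = o_t \odot \tanh(c_t)

With cont_t = 0 the previous cell state is flushed, which is why TestForward expects the all-zero-cont pass to match the full-sequence pass only at the first timestep.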


@ -9,6 +9,7 @@
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/net.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/test/test_caffe_main.hpp"
@ -29,6 +30,17 @@ class NetTest : public MultiDeviceTest<TypeParam> {
net_.reset(new Net<Dtype>(param));
}
virtual void InitNetFromProtoFileWithState(const string& proto,
Phase phase = caffe::TRAIN, const int level = 0,
const vector<string>* stages = NULL) {
NetParameter param;
CHECK(google::protobuf::TextFormat::ParseFromString(proto, &param));
string param_file;
MakeTempFilename(&param_file);
WriteProtoToTextFile(param, param_file);
net_.reset(new Net<Dtype>(param_file, phase, level, stages));
}
virtual void CopyNetBlobs(const bool copy_diff,
vector<shared_ptr<Blob<Dtype> > >* blobs_copy) {
CHECK(net_);
@ -771,6 +783,62 @@ class NetTest : public MultiDeviceTest<TypeParam> {
InitNetFromProtoString(proto);
}
virtual void InitAllInOneNet(Phase phase = caffe::TRAIN,
const int level = 0, const vector<string>* stages = NULL) {
string proto =
"name: 'All-in-one Network'"
"layer { "
" name: 'train-data' "
" type: 'DummyData' "
" top: 'data' "
" top: 'label' "
" dummy_data_param { "
" shape { dim: 1 dim: 10 } "
" shape { dim: 1 dim: 1 } "
" } "
" include { phase: TRAIN stage: 'train' } "
"} "
"layer { "
" name: 'val-data' "
" type: 'DummyData' "
" top: 'data' "
" top: 'label' "
" dummy_data_param { "
" shape { dim: 1 dim: 10 } "
" shape { dim: 1 dim: 1 } "
" } "
" include { phase: TEST stage: 'val' } "
"} "
"layer { "
" name: 'deploy-data' "
" type: 'Input' "
" top: 'data' "
" input_param { "
" shape { dim: 1 dim: 10 } "
" } "
" include { phase: TEST stage: 'deploy' } "
"} "
"layer { "
" name: 'ip' "
" type: 'InnerProduct' "
" bottom: 'data' "
" top: 'ip' "
" inner_product_param { "
" num_output: 2 "
" } "
"} "
"layer { "
" name: 'loss' "
" type: 'SoftmaxWithLoss' "
" bottom: 'ip' "
" bottom: 'label' "
" top: 'loss' "
" include { phase: TRAIN stage: 'train' } "
" include { phase: TEST stage: 'val' } "
"} ";
InitNetFromProtoFileWithState(proto, phase, level, stages);
}
int seed_;
shared_ptr<Net<Dtype> > net_;
};
@ -2473,4 +2541,64 @@ TYPED_TEST(NetTest, TestForcePropagateDown) {
}
}
TYPED_TEST(NetTest, TestAllInOneNetTrain) {
vector<string> stages;
stages.push_back("train");
this->InitAllInOneNet(caffe::TRAIN, 0, &stages);
bool found_data = false;
bool found_loss = false;
for (int i = 0; i < this->net_->layers().size(); ++i) {
const string& layer_name = this->net_->layer_names()[i];
if (layer_name == "train-data") {
found_data = true;
} else if (layer_name == "loss") {
found_loss = true;
} else {
ASSERT_NE(layer_name, "val-data");
ASSERT_NE(layer_name, "deploy-data");
}
}
ASSERT_TRUE(found_data);
ASSERT_TRUE(found_loss);
}
TYPED_TEST(NetTest, TestAllInOneNetVal) {
vector<string> stages;
stages.push_back("val");
this->InitAllInOneNet(caffe::TEST, 0, &stages);
bool found_data = false;
bool found_loss = false;
for (int i = 0; i < this->net_->layers().size(); ++i) {
const string& layer_name = this->net_->layer_names()[i];
if (layer_name == "val-data") {
found_data = true;
} else if (layer_name == "loss") {
found_loss = true;
} else {
ASSERT_NE(layer_name, "train-data");
ASSERT_NE(layer_name, "deploy-data");
}
}
ASSERT_TRUE(found_data);
ASSERT_TRUE(found_loss);
}
TYPED_TEST(NetTest, TestAllInOneNetDeploy) {
vector<string> stages;
stages.push_back("deploy");
this->InitAllInOneNet(caffe::TEST, 0, &stages);
bool found_data = false;
for (int i = 0; i < this->net_->layers().size(); ++i) {
const string& layer_name = this->net_->layer_names()[i];
if (layer_name == "deploy-data") {
found_data = true;
} else {
ASSERT_NE(layer_name, "train-data");
ASSERT_NE(layer_name, "val-data");
ASSERT_NE(layer_name, "loss");
}
}
ASSERT_TRUE(found_data);
}
} // namespace caffe
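A minimal sketch of driving the same all-in-one selection from user code, via the file-based Net constructor these tests go through (model path hypothetical):

#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include "caffe/net.hpp"

// Hypothetical helper: select the deploy variant of an all-in-one model.
// Layers whose include rules do not match (phase TEST, stage "deploy")
// are dropped from the instantiated net.
boost::shared_ptr<caffe::Net<float> > LoadDeployNet(const std::string& model) {
  std::vector<std::string> stages;
  stages.push_back("deploy");
  return boost::shared_ptr<caffe::Net<float> >(
      new caffe::Net<float>(model, caffe::TEST, /*level=*/0, &stages));
}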


@ -0,0 +1,217 @@
#include <cstring>
#include <vector>
#include "gtest/gtest.h"
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layers/rnn_layer.hpp"
#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"
namespace caffe {
template <typename TypeParam>
class RNNLayerTest : public MultiDeviceTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
protected:
RNNLayerTest() : num_output_(7) {
blob_bottom_vec_.push_back(&blob_bottom_);
blob_bottom_vec_.push_back(&blob_bottom_cont_);
blob_top_vec_.push_back(&blob_top_);
ReshapeBlobs(1, 3);
layer_param_.mutable_recurrent_param()->set_num_output(num_output_);
FillerParameter* weight_filler =
layer_param_.mutable_recurrent_param()->mutable_weight_filler();
weight_filler->set_type("gaussian");
weight_filler->set_std(0.2);
FillerParameter* bias_filler =
layer_param_.mutable_recurrent_param()->mutable_bias_filler();
bias_filler->set_type("gaussian");
bias_filler->set_std(0.1);
layer_param_.set_phase(TEST);
}
void ReshapeBlobs(int num_timesteps, int num_instances) {
blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2);
blob_bottom_static_.Reshape(num_instances, 2, 3, 4);
vector<int> shape(2);
shape[0] = num_timesteps;
shape[1] = num_instances;
blob_bottom_cont_.Reshape(shape);
FillerParameter filler_param;
filler_param.set_min(-1);
filler_param.set_max(1);
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&blob_bottom_);
}
int num_output_;
LayerParameter layer_param_;
Blob<Dtype> blob_bottom_;
Blob<Dtype> blob_bottom_cont_;
Blob<Dtype> blob_bottom_static_;
Blob<Dtype> blob_top_;
vector<Blob<Dtype>*> blob_bottom_vec_;
vector<Blob<Dtype>*> blob_top_vec_;
};
TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices);
TYPED_TEST(RNNLayerTest, TestSetUp) {
typedef typename TypeParam::Dtype Dtype;
RNNLayer<Dtype> layer(this->layer_param_);
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
vector<int> expected_top_shape = this->blob_bottom_.shape();
expected_top_shape.resize(3);
expected_top_shape[2] = this->num_output_;
EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape);
}
TYPED_TEST(RNNLayerTest, TestForward) {
typedef typename TypeParam::Dtype Dtype;
const int kNumTimesteps = 3;
const int num = this->blob_bottom_.shape(1);
this->ReshapeBlobs(kNumTimesteps, num);
// Fill the cont blob with <0, 1, 1, ..., 1>,
// indicating a sequence that begins at the first timestep
// then continues for the rest of the sequence.
for (int t = 0; t < kNumTimesteps; ++t) {
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[t * num + n] = t > 0;
}
}
// Process the full sequence in a single batch.
FillerParameter filler_param;
filler_param.set_mean(0);
filler_param.set_std(1);
GaussianFiller<Dtype> sequence_filler(filler_param);
sequence_filler.Fill(&this->blob_bottom_);
shared_ptr<RNNLayer<Dtype> > layer(new RNNLayer<Dtype>(this->layer_param_));
Caffe::set_random_seed(1701);
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
LOG(INFO) << "Calling forward for full sequence RNN";
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
// Copy the inputs and outputs to reuse/check them later.
Blob<Dtype> bottom_copy(this->blob_bottom_.shape());
bottom_copy.CopyFrom(this->blob_bottom_);
Blob<Dtype> top_copy(this->blob_top_.shape());
top_copy.CopyFrom(this->blob_top_);
// Process the batch one timestep at a time;
// check that we get the same result.
this->ReshapeBlobs(1, num);
layer.reset(new RNNLayer<Dtype>(this->layer_param_));
Caffe::set_random_seed(1701);
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
const int bottom_count = this->blob_bottom_.count();
const int top_count = this->blob_top_.count();
const Dtype kEpsilon = 1e-5;
for (int t = 0; t < kNumTimesteps; ++t) {
caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
this->blob_bottom_.mutable_cpu_data());
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[n] = t > 0;
}
LOG(INFO) << "Calling forward for RNN timestep " << t;
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
for (int i = 0; i < top_count; ++i) {
ASSERT_LT(t * top_count + i, top_copy.count());
EXPECT_NEAR(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i], kEpsilon)
<< "t = " << t << "; i = " << i;
}
}
// Process the batch one timestep at a time with all cont blobs set to 0.
// Check that we get a different result, except in the first timestep.
Caffe::set_random_seed(1701);
layer.reset(new RNNLayer<Dtype>(this->layer_param_));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
for (int t = 0; t < kNumTimesteps; ++t) {
caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
this->blob_bottom_.mutable_cpu_data());
for (int n = 0; n < num; ++n) {
this->blob_bottom_cont_.mutable_cpu_data()[n] = 0;
}
LOG(INFO) << "Calling forward for RNN timestep " << t;
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
for (int i = 0; i < top_count; ++i) {
if (t == 0) {
EXPECT_NEAR(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i], kEpsilon)
<< "t = " << t << "; i = " << i;
} else {
EXPECT_NE(this->blob_top_.cpu_data()[i],
top_copy.cpu_data()[t * top_count + i])
<< "t = " << t << "; i = " << i;
}
}
}
}
TYPED_TEST(RNNLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
RNNLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(RNNLayerTest, TestGradientNonZeroCont) {
typedef typename TypeParam::Dtype Dtype;
RNNLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2) {
typedef typename TypeParam::Dtype Dtype;
this->ReshapeBlobs(2, 2);
// Fill the input blob with random values.
FillerParameter filler_param;
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&this->blob_bottom_);
RNNLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
}
TYPED_TEST(RNNLayerTest, TestGradientNonZeroContBufferSize2WithStaticInput) {
typedef typename TypeParam::Dtype Dtype;
this->ReshapeBlobs(2, 2);
FillerParameter filler_param;
UniformFiller<Dtype> filler(filler_param);
filler.Fill(&this->blob_bottom_);
filler.Fill(&this->blob_bottom_static_);
this->blob_bottom_vec_.push_back(&this->blob_bottom_static_);
RNNLayer<Dtype> layer(this->layer_param_);
GradientChecker<Dtype> checker(1e-2, 1e-3);
for (int i = 0; i < this->blob_bottom_cont_.count(); ++i) {
this->blob_bottom_cont_.mutable_cpu_data()[i] = i > 2;
}
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 2);
}
} // namespace caffe
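As in the LSTM tests, cont gates the recurrence. A sketch assuming Caffe's standard RNN formulation with tanh hidden and output nonlinearities:

h_t = \tanh\big(W_{hh}\,(\mathrm{cont}_t \cdot h_{t-1}) + W_{xh}\,x_t + b_h\big), \qquad o_t = \tanh\big(W_{ho}\,h_t + b_o\big)

so cont_t = 0 resets the hidden state, and every later timestep of the zero-cont pass is expected to differ from the full-sequence pass.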


@ -15,7 +15,7 @@ namespace caffe { namespace db {
void LMDB::Open(const string& source, Mode mode) {
MDB_CHECK(mdb_env_create(&mdb_env_));
if (mode == NEW) {
CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << " failed";
}
int flags = 0;
if (mode == READ) {
@ -67,36 +67,42 @@ void LMDBTransaction::Commit() {
MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi));
bool out_of_memory = false;
for (int i = 0; i < keys.size(); i++) {
mdb_key.mv_size = keys[i].size();
mdb_key.mv_data = const_cast<char*>(keys[i].data());
mdb_data.mv_size = values[i].size();
mdb_data.mv_data = const_cast<char*>(values[i].data());
// Add data to the transaction
int put_rc = mdb_put(mdb_txn, mdb_dbi, &mdb_key, &mdb_data, 0);
if (put_rc == MDB_MAP_FULL) {
out_of_memory = true;
break;
} else {
// Failed for some other reason
MDB_CHECK(put_rc);
// Out of memory - double the map size and retry
mdb_txn_abort(mdb_txn);
mdb_dbi_close(mdb_env_, mdb_dbi);
DoubleMapSize();
Commit();
return;
}
// May have failed for some other reason
MDB_CHECK(put_rc);
}
if (!out_of_memory) {
// Commit the transaction
MDB_CHECK(mdb_txn_commit(mdb_txn));
mdb_dbi_close(mdb_env_, mdb_dbi);
keys.clear();
values.clear();
} else {
// Double the map size and retry
mdb_txn_abort(mdb_txn);
// Commit the transaction
int commit_rc = mdb_txn_commit(mdb_txn);
if (commit_rc == MDB_MAP_FULL) {
// Out of memory - double the map size and retry
mdb_dbi_close(mdb_env_, mdb_dbi);
DoubleMapSize();
Commit();
return;
}
// May have failed for some other reason
MDB_CHECK(commit_rc);
// Cleanup after successful commit
mdb_dbi_close(mdb_env_, mdb_dbi);
keys.clear();
values.clear();
}
void LMDBTransaction::DoubleMapSize() {
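The hunk ends at the DoubleMapSize() signature; its body is not shown, but the retry logic above presumes it grows the LMDB memory map. A sketch under that assumption, using the stock LMDB API:

// Sketch (assumption): double the LMDB map size so the retried Commit()
// has room. mdb_env_set_mapsize may only be called while no transactions
// are active, which holds after the mdb_txn_abort above.
void LMDBTransaction::DoubleMapSize() {
  MDB_envinfo current_info;
  MDB_CHECK(mdb_env_info(mdb_env_, &current_info));
  size_t new_size = current_info.me_mapsize * 2;
  MDB_CHECK(mdb_env_set_mapsize(mdb_env_, new_size));
}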


@ -34,6 +34,13 @@ DEFINE_string(solver, "",
"The solver definition protocol buffer text file.");
DEFINE_string(model, "",
"The model definition protocol buffer text file.");
DEFINE_string(phase, "",
"Optional; network phase (TRAIN or TEST). Only used for 'time'.");
DEFINE_int32(level, 0,
"Optional; network level.");
DEFINE_string(stage, "",
"Optional; network stages (not to be confused with phase), "
"separated by ','.");
DEFINE_string(snapshot, "",
"Optional; the snapshot solver state to resume training.");
DEFINE_string(weights, "",
@ -101,6 +108,25 @@ static void get_gpus(vector<int>* gpus) {
}
}
// Parse phase from flags
caffe::Phase get_phase_from_flags(caffe::Phase default_value) {
if (FLAGS_phase == "")
return default_value;
if (FLAGS_phase == "TRAIN")
return caffe::TRAIN;
if (FLAGS_phase == "TEST")
return caffe::TEST;
LOG(FATAL) << "phase must be \"TRAIN\" or \"TEST\"";
return caffe::TRAIN; // Avoid warning
}
// Parse stages from flags
vector<string> get_stages_from_flags() {
vector<string> stages;
boost::split(stages, FLAGS_stage, boost::is_any_of(","));
return stages;
}
// caffe commands to call by
// caffe <command> <args>
//
@ -157,10 +183,16 @@ int train() {
CHECK(!FLAGS_snapshot.size() || !FLAGS_weights.size())
<< "Give a snapshot to resume training or weights to finetune "
"but not both.";
vector<string> stages = get_stages_from_flags();
caffe::SolverParameter solver_param;
caffe::ReadSolverParamsFromTextFileOrDie(FLAGS_solver, &solver_param);
solver_param.mutable_train_state()->set_level(FLAGS_level);
for (int i = 0; i < stages.size(); i++) {
solver_param.mutable_train_state()->add_stage(stages[i]);
}
// If the gpus flag is not provided, allow the mode and device to be set
// in the solver prototxt.
if (FLAGS_gpu.size() == 0
@ -230,6 +262,7 @@ RegisterBrewFunction(train);
int test() {
CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to score.";
CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score.";
vector<string> stages = get_stages_from_flags();
// Set device id and mode
vector<int> gpus;
@ -248,7 +281,7 @@ int test() {
Caffe::set_mode(Caffe::CPU);
}
// Instantiate the caffe net.
Net<float> caffe_net(FLAGS_model, caffe::TEST);
Net<float> caffe_net(FLAGS_model, caffe::TEST, FLAGS_level, &stages);
caffe_net.CopyTrainedLayersFrom(FLAGS_weights);
LOG(INFO) << "Running for " << FLAGS_iterations << " iterations.";
@ -301,6 +334,8 @@ RegisterBrewFunction(test);
// Time: benchmark the execution time of a model.
int time() {
CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time.";
caffe::Phase phase = get_phase_from_flags(caffe::TRAIN);
vector<string> stages = get_stages_from_flags();
// Set device id and mode
vector<int> gpus;
@ -314,7 +349,7 @@ int time() {
Caffe::set_mode(Caffe::CPU);
}
// Instantiate the caffe net.
Net<float> caffe_net(FLAGS_model, caffe::TRAIN);
Net<float> caffe_net(FLAGS_model, phase, FLAGS_level, &stages);
// Do a clean forward and backward pass, so that memory allocation are done
// and future iterations will be more stable.
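Taken together, the new phase/level/stage flags let one all-in-one model definition drive training, scoring, and timing. Hypothetical invocations (file names assumed):

caffe train --solver=solver.prototxt --stage=train
caffe test --model=all_in_one.prototxt --weights=net.caffemodel --stage=val
caffe time --model=all_in_one.prototxt --phase=TEST --stage=deploy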


@ -73,10 +73,13 @@ int main(int argc, char** argv) {
std::ifstream infile(argv[2]);
std::vector<std::pair<std::string, int> > lines;
std::string filename;
std::string line;
size_t pos;
int label;
while (infile >> filename >> label) {
lines.push_back(std::make_pair(filename, label));
while (std::getline(infile, line)) {
pos = line.find_last_of(' ');
label = atoi(line.substr(pos + 1).c_str());
lines.push_back(std::make_pair(line.substr(0, pos), label));
}
if (FLAGS_shuffle) {
// randomly shuffle data
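The new loop splits each listing line on its last space, so labels still parse when the image path itself contains spaces. A self-contained sketch of the same parsing (sample line hypothetical):

#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  // Only the text after the last space is the label; everything before
  // it, spaces included, is the file name.
  std::string line = "images/cat gray.jpg 1";
  size_t pos = line.find_last_of(' ');
  int label = std::atoi(line.substr(pos + 1).c_str());
  std::string filename = line.substr(0, pos);
  std::cout << filename << " -> " << label << std::endl;
  return 0;
}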


@ -16,13 +16,10 @@ from collections import OrderedDict
def parse_log(path_to_log):
"""Parse log file
Returns (train_dict_list, train_dict_names, test_dict_list, test_dict_names)
Returns (train_dict_list, test_dict_list)
train_dict_list and test_dict_list are lists of dicts that define the table
rows
train_dict_names and test_dict_names are ordered tuples of the column names
for the two dict_lists
"""
regex_iteration = re.compile('Iteration (\d+)')


@ -130,7 +130,7 @@ int feature_extraction_pipeline(int argc, char** argv) {
txns.push_back(txn);
}
LOG(ERROR)<< "Extacting Features";
LOG(ERROR)<< "Extracting Features";
Datum datum;
std::vector<int> image_indices(num_features, 0);