Merge pull request #167 from BVLC/next

So be it.
2014-02-26 15:32:42 -08:00 · 2014-02-26 15:32:42 -08:00 · 9da7bcb297
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
+## General
+
 # Compiled Object files
 *.slo
 *.lo
@ -19,25 +21,38 @@
 *.pb.cc
 *_pb2.py

-# bin files
+# Compiled python
+*.pyc
+
+# Compiled MATLAB
+*.mex
+*.mexa64
+*.mexmaci64
+
+# build, distribute, and bins
+build/*
+distribute/*
 *.testbin
 *.bin

-# vim swp files
+# Editor temporaries
 *.swp
-
-# matlab binary
-*.mexa64
+*~

 # IPython notebook checkpoints
 .ipynb_checkpoints

-# anything under data/ unless we force include them
-data/*
+## Caffe

-# anything under distribute
-distribute/*
-
-# user's specified config
+# User's build configuration
 Makefile.config
-docs/_site
+
+# Models, Data, and Examples are either
+# 1. reference, and not casually committed
+# 2. custom, and live on their own unless they're deliberately contributed
+models/*
+data/*
+examples/*
+
+# Don't version the generated documentation
+docs/_site
--- a/28
+++ b/28
@ -26,6 +26,8 @@ TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp")
 GTEST_SRC := src/gtest/gtest-all.cpp
 # TEST_HDRS are the test header files
 TEST_HDRS := $(shell find src/$(PROJECT) -name "test_*.hpp")
+# TOOL_SRCS are the source files for the tool binaries
+TOOL_SRCS := $(shell find tools -name "*.cpp")
 # EXAMPLE_SRCS are the source files for the example binaries
 EXAMPLE_SRCS := $(shell find examples -name "*.cpp")
 # PROTO_SRCS are the protocol buffer definitions
@ -46,16 +48,18 @@ PROTO_GEN_CC := ${PROTO_SRCS:.proto=.pb.cc}
 PROTO_GEN_PY := ${PROTO_SRCS:.proto=_pb2.py}
 # The objects corresponding to the source files
 # These objects will be linked into the final shared library, so we
-# exclude the test and example objects.
+# exclude the tool, example, and test objects.
 CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o})
 CU_OBJS := $(addprefix $(BUILD_DIR)/, ${CU_SRCS:.cu=.cuo})
 PROTO_OBJS := $(addprefix $(BUILD_DIR)/, ${PROTO_GEN_CC:.cc=.o})
 OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS)
-# program and test objects
+# tool, example, and test objects
+TOOL_OBJS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o})
 EXAMPLE_OBJS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o})
 TEST_OBJS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o})
 GTEST_OBJ := $(addprefix $(BUILD_DIR)/, ${GTEST_SRC:.cpp=.o})
-# program and test bins
+# tool, example, and test bins
+TOOL_BINS := ${TOOL_OBJS:.o=.bin}
 EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin}
 TEST_BINS := ${TEST_OBJS:.o=.testbin}

@ -86,13 +90,14 @@ PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 ##############################
 # Define build targets
 ##############################
-.PHONY: all init test clean linecount examples py mat distribute py$(PROJECT) mat$(PROJECT) proto
+.PHONY: all init test clean linecount tools examples py mat distribute py$(PROJECT) mat$(PROJECT) proto

-all: init $(NAME) $(STATIC_NAME) examples
+all: init $(NAME) $(STATIC_NAME) tools examples
 	@echo $(CXX_OBJS)

 init:
 	@ mkdir -p $(foreach obj,$(OBJS),$(dir $(obj)))
+	@ mkdir -p $(foreach obj,$(TOOL_OBJS),$(dir $(obj)))
 	@ mkdir -p $(foreach obj,$(EXAMPLE_OBJS),$(dir $(obj)))
 	@ mkdir -p $(foreach obj,$(TEST_OBJS),$(dir $(obj)))
 	@ mkdir -p $(foreach obj,$(GTEST_OBJ),$(dir $(obj)))
@ -102,6 +107,8 @@ linecount: clean

 test: init $(TEST_BINS)

+tools: init $(TOOL_BINS)
+
 examples: init $(EXAMPLE_BINS)

 py$(PROJECT): py
@ -134,6 +141,10 @@ runtest: test
 $(TEST_BINS): %.testbin : %.o $(GTEST_OBJ) $(STATIC_NAME) $(TEST_HDRS)
 	$(CXX) $< $(GTEST_OBJ) $(STATIC_NAME) -o $@ $(CXXFLAGS) $(LDFLAGS) $(WARNINGS)

+$(TOOL_BINS): %.bin : %.o $(STATIC_NAME)
+	$(CXX) $< $(STATIC_NAME) -o $@ $(CXXFLAGS) $(LDFLAGS) $(WARNINGS)
+	@echo
+
 $(EXAMPLE_BINS): %.bin : %.o $(STATIC_NAME)
 	$(CXX) $< $(STATIC_NAME) -o $@ $(CXXFLAGS) $(LDFLAGS) $(WARNINGS)
 	@echo
@ -172,6 +183,10 @@ $(BUILD_DIR)/src/$(PROJECT)/util/%.cuo: src/$(PROJECT)/util/%.cu
 	$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@
 	@echo

+$(BUILD_DIR)/tools/%.o: tools/%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@ $(LDFLAGS)
+	@echo
+
 $(BUILD_DIR)/examples/%.o: examples/%.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@ $(LDFLAGS)
 	@echo
@ -201,8 +216,9 @@ distribute: all
 	mkdir $(DISTRIBUTE_DIR)
 	# add include
 	cp -r include $(DISTRIBUTE_DIR)/
-	# add example binaries
+	# add tool and example binaries
 	mkdir $(DISTRIBUTE_DIR)/bin
+	cp $(TOOL_BINS) $(DISTRIBUTE_DIR)/bin
 	cp $(EXAMPLE_BINS) $(DISTRIBUTE_DIR)/bin
 	# add libraries
 	mkdir $(DISTRIBUTE_DIR)/lib
--- a/data/cifar10/get_cifar10.sh
+++ b/data/cifar10/get_cifar10.sh
@ -0,0 +1,19 @@
+#!/usr/bin/env sh
+# This script downloads the CIFAR10 (binary version) data and unzips it.
+# It works from its own directory, found via $0 (BASH_SOURCE is bash-only).
+DIR="$( cd "$(dirname "$0")" && pwd )"
+cd "$DIR" || exit 1
+
+echo "Downloading..."
+
+wget -q http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
+
+echo "Unzipping..."
+
+tar -xf cifar-10-binary.tar.gz && rm -f cifar-10-binary.tar.gz
+mv cifar-10-batches-bin/* . && rm -rf cifar-10-batches-bin
+
+# Creation is split out because leveldb sometimes causes segfault
+# and needs to be re-created.
+
+echo "Done."
--- a/data/create_mnist.sh
+++ b/data/create_mnist.sh
@ -1,12 +0,0 @@
-#!/usr/bin/env sh
-# This script converts the mnist data into leveldb format.
-
-echo "Creating leveldb..."
-
-rm -rf mnist-train-leveldb
-rm -rf mnist-test-leveldb
-
-../build/examples/convert_mnist_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte mnist-train-leveldb
-../build/examples/convert_mnist_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte mnist-test-leveldb
-
-echo "Done."
--- a/data/ilsvrc12/get_ilsvrc_aux.sh
+++ b/data/ilsvrc12/get_ilsvrc_aux.sh
@ -0,0 +1,20 @@
+#!/usr/bin/env sh
+#
+# N.B. This does not download the ilsvrc12 data set, as it is gargantuan.
+# This script downloads the imagenet example auxiliary files including:
+# - the ilsvrc12 image mean, binaryproto
+# - synset ids and words
+# - the training splits with labels
+# It works from its own directory, found via $0 (BASH_SOURCE is bash-only).
+DIR="$( cd "$(dirname "$0")" && pwd )"
+cd "$DIR" || exit 1
+
+echo "Downloading..."
+
+wget -q https://www.dropbox.com/s/g5myor4y2scdv95/caffe_ilsvrc12.tar.gz
+
+echo "Unzipping..."
+
+tar -xf caffe_ilsvrc12.tar.gz && rm -f caffe_ilsvrc12.tar.gz
+
+echo "Done."
--- a/data/mnist/get_mnist.sh
+++ b/data/mnist/get_mnist.sh
@ -1,6 +1,9 @@
 #!/usr/bin/env sh
 # This scripts downloads the mnist data and unzips it.

+DIR="$( cd "$(dirname "$0")" && pwd )"  # $0, not BASH_SOURCE: this runs under sh
+cd "$DIR" || exit 1
+
 echo "Downloading..."

 wget -q http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
--- a/data/train_mnist.sh
+++ b/data/train_mnist.sh
@ -1,3 +0,0 @@
-#!/usr/bin/env sh
-
-GLOG_logtostderr=1 ../build/examples/train_net.bin lenet_solver.prototxt
--- a/docs/imagenet_pretrained.md
+++ b/docs/imagenet_pretrained.md
@ -6,33 +6,23 @@ title: Caffe
 Running Pretrained ImageNet
 ===========================

-[View this page as an IPython Notebook](http://nbviewer.ipython.org/url/caffe.berkeleyvision.org/imagenet_pretrained_files/imagenet_pretrained.ipynb)
+[View this page as an IPython Notebook](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/imagenet_pretrained.ipynb)

-For easier use of pretrained models, we provide a wrapper specifically written
-for the case of ImageNet, so one can take an image and directly compute features
-or predictions from them. Both Python and Matlab wrappers are provided. We will
-describe the use of the Python wrapper here, and the Matlab wrapper usage is
-very similar.
+For easier use of pretrained models, we provide a wrapper specifically written for the case of ImageNet, so one can take an image and directly compute features or predictions from them. Both Python and Matlab wrappers are provided. We will describe the use of the Python wrapper here, and the Matlab wrapper usage is very similar.

-We assume that you have successfully compiled Caffe and set the correct
-`PYTHONPATH`. If not, please refer to the [installation
-instructions](installation.html). You will use our pre-trained imagenet model,
-which you can
-[download here](https://www.dropbox.com/s/n3jups0gr7uj0dv/caffe_reference_imagenet_model)
-(232.57MB). Note that this pre-trained model is licensed for academic research /
-non-commercial use only.
+We assume that you have successfully compiled Caffe and set the correct `PYTHONPATH`. If not, please refer to the [installation instructions](installation.html). You will use our pre-trained imagenet model, which you can download (232.57MB) by running `models/get_caffe_reference_imagenet_model.sh`. Note that this pre-trained model is licensed for academic research / non-commercial use only.

 Ready? Let's start.


    from caffe import imagenet
    from matplotlib import pyplot
-    
+
    # Set the right path to your model file, pretrained model,
    # and the image you would like to classify.
-    MODEL_FILE = 'examples/imagenet_deploy.prototxt'
-    PRETRAINED = '/home/jiayq/Downloads/caffe_reference_imagenet_model'
-    IMAGE_FILE = '/home/jiayq/lena.png'
+    MODEL_FILE = 'models/imagenet.prototxt'
+    PRETRAINED = 'models/caffe_reference_imagenet_model'
+    IMAGE_FILE = '/path/to/lena.png'

 Loading a network is easy. imagenet.ImagenetClassifier wraps everything. In
 default, the classifier will crop the center and corners of an image, as well as
--- a/docs/imagenet_pretrained_files/imagenet_pretrained.ipynb
+++ b/docs/imagenet_pretrained_files/imagenet_pretrained.ipynb
--- a/docs/imagenet_training.md
+++ b/docs/imagenet_training.md
@ -7,11 +7,11 @@ Yangqing's Recipe on Brewing ImageNet
 =====================================

    "All your braincells are belong to us."
-        - Starbucks
+        - Caffeine

-We are going to describe a reference implementation for the approach first proposed by Krizhevsky, Sutskever, and Hinton in their [NIPS 2012 paper](http://books.nips.cc/papers/files/nips25/NIPS2012_0534.pdf). Since training the whole model takes quite some time and energy, we also provide a model, trained in the same way as we describe here, to help fight global warming. If you would like to simply use the pretrained model, check out the [Pretrained ImageNet](imagenet_pretrained.html) page.
+We are going to describe a reference implementation for the approach first proposed by Krizhevsky, Sutskever, and Hinton in their [NIPS 2012 paper](http://books.nips.cc/papers/files/nips25/NIPS2012_0534.pdf). Since training the whole model takes some time and energy, we provide a model, trained in the same way as we describe here, to help fight global warming. If you would like to simply use the pretrained model, check out the [Pretrained ImageNet](imagenet_pretrained.html) page. *Note that the pretrained model is for academic research / non-commercial use only*.

-To clarify, by ImageNet we actually mean the ILSVRC challenge, but you can easily train on the whole imagenet as well, just more disk space, and a little longer training time.
+To clarify, by ImageNet we actually mean the ILSVRC12 challenge, but you can easily train on the whole of ImageNet as well, just with more disk space, and a little longer training time.

 (If you don't get the quote, visit [Yann LeCun's fun page](http://yann.lecun.com/ex/fun/).

@ -23,7 +23,12 @@ We assume that you already have downloaded the ImageNet training data and valida
    /path/to/imagenet/train/n01440764/n01440764_10026.JPEG
    /path/to/imagenet/val/ILSVRC2012_val_00000001.JPEG

-You will first need to create a text file listing all the files as well as their labels. An example could be found in the caffe repo at `python/caffe/imagenet/ilsvrc_2012_train.txt` and `ilsvrc_2012_val.txt`. Note that in those two files we used a different indexing from the ILSVRC devkit: we sorted the synset names in their ASCII order, and then labeled them from 0 to 999.
+You will first need to prepare some auxiliary data for training. This data can be downloaded by:
+
+    cd $CAFFE_ROOT/data/ilsvrc12/
+    ./get_ilsvrc_aux.sh
+
+The training and validation inputs are described in `train.txt` and `val.txt` as text listing all the files and their labels. Note that we use a different indexing for labels than the ILSVRC devkit: we sort the synset names in their ASCII order, and then label them from 0 to 999. See `synset_words.txt` for the synset/name mapping.

 You will also need to resize the images to 256x256: we do not explicitly do this because in a cluster environment, one may benefit from resizing images in a parallel fashion, using mapreduce. For example, Yangqing used his lightedweighted [mincepie](https://github.com/Yangqing/mincepie) package to do mapreduce on the Berkeley cluster. If you would things to be rather simple and straightforward, you can also use shell commands, something like:

@ -31,34 +36,30 @@ You will also need to resize the images to 256x256: we do not explicitly do this
        convert -resize 256x256\! $name $name
    done

-Now, you can simply create a leveldb using commands as follows:
+Go to `$CAFFE_ROOT/examples/imagenet/` for the rest of this guide.

-    GLOG_logtostderr=1 examples/convert_imageset.bin \
-        /path/to/imagenet/train/ \
-        python/caffe/imagenet/ilsvrc_2012_train.txt \
-        /path/to/imagenet-train-leveldb 1
-
-Note that `/path/to/imagenet-train-leveldb` should not exist before this execution. It will be created by the script. `GLOG_logtostderr=1` simply dumps more information for you to inspect, and you can safely ignore it.
+Take a look at `create_imagenet.sh`. Set the paths to the train and val dirs as needed. Now simply create the leveldbs with `./create_imagenet.sh`. Note that the train and val leveldb directories named in the script should not exist before this execution; they will be created by the script. `GLOG_logtostderr=1` simply dumps more information for you to inspect, and you can safely ignore it.

 Compute Image Mean
 ------------------

-The Model requires us to subtract the image mean from each image, so we have to compute the mean. `examples/demo_compute_image_mean.cpp` implements that - it is also a good example to familiarize yourself on how to manipulate the multiple components, such as protocol buffers, leveldbs, and logging, if you are not familiar with it. Anyway, the mean computation can be carried out as:
+The model requires us to subtract the image mean from each image, so we have to compute the mean. `tools/compute_image_mean.cpp` implements that - it is also a good example to familiarize yourself on how to manipulate the multiple components, such as protocol buffers, leveldbs, and logging, if you are not familiar with them. Anyway, the mean computation can be carried out as:

-    examples/demo_compute_image_mean.bin /path/to/imagenet-train-leveldb /path/to/mean.binaryproto
+    ./make_imagenet_mean.sh

-where `/path/to/mean.binaryproto` will be created by the program.
+which will make `data/ilsvrc12/imagenet_mean.binaryproto`.

 Network Definition
 ------------------
-The network definition follows strictly the one in Krizhevsky et al. You can find the detailed definition at `examples/imagenet.prototxt`. Note that to run it, you will most likely need to change the paths in the data layer - change the following lines

-    source: "/home/jiayq/Data/ILSVRC12/train-leveldb"
-    meanfile: "/home/jiayq/Data/ILSVRC12/image_mean.binaryproto"
+The network definition follows strictly the one in Krizhevsky et al. You can find the detailed definition at `examples/imagenet/imagenet_train.prototxt`. Note the paths in the data layer: if you have not followed the exact paths in this guide, you will need to change the following lines:

-to point to your own leveldb and image mean. Likewise, do the same for `examples/imagenet_val.prototxt`.
+    source: "ilsvrc12_train_leveldb"
+    meanfile: "../../data/ilsvrc12/imagenet_mean.binaryproto"

-If you look carefully at `imagenet.prototxt` and `imagenet_val.prototxt`, you will notice that they are largely the same, with the only difference being the data layer sources, and the last layer: in training, we will be using a `softmax_loss` layer to compute the loss function and to initialize the backpropagation, while in validation we will be using an `accuracy` layer to inspect how well we do in terms of accuracy.
+to point to your own leveldb and image mean. Likewise, do the same for `examples/imagenet/imagenet_val.prototxt`.
+
+If you look carefully at `imagenet_train.prototxt` and `imagenet_val.prototxt`, you will notice that they are largely the same, with the only difference being the data layer sources, and the last layer: in training, we will be using a `softmax_loss` layer to compute the loss function and to initialize the backpropagation, while in validation we will be using an `accuracy` layer to inspect how well we do in terms of accuracy.

 We will also lay out a protocol buffer for running the solver. Let's make a few plans:
 * We will run in batches of 256, and run a total of 4,500,000 iterations (about 90 epochs).
@ -68,19 +69,19 @@ We will also lay out a protocol buffer for running the solver. Let's make a few
 * The network will be trained with momentum 0.9 and a weight decay of 0.0005.
 * For every 10,000 iterations, we will take a snapshot of the current status.

-Sounds good? This is implemented in `examples/imagenet_solver.prototxt`. Again, you will need to change the first two lines:
+Sound good? This is implemented in `examples/imagenet/imagenet_solver.prototxt`. Again, you will need to change the first two lines:

-    train_net: "examples/imagenet.prototxt"
-    test_net: "examples/imagenet_val.prototxt"
+    train_net: "imagenet_train.prototxt"
+    test_net: "imagenet_val.prototxt"

-to point to the actual path.
+to point to the actual path if you have changed them.

 Training ImageNet
 -----------------

 Ready? Let's train.

-    GLOG_logtostderr=1 examples/train_net.bin examples/imagenet_solver.prototxt
+    ./train_imagenet.sh

 Sit back and enjoy! On my K20 machine, every 20 iterations take about 36 seconds to run, so effectively about 7 ms per image for the full forward-backward pass. About 2.5 ms of this is on forward, and the rest is backward. If you are interested in dissecting the computation time, you can look at `examples/net_speed_benchmark.cpp`, but it was written purely for debugging purpose, so you may need to figure a few things out yourself.

@ -89,13 +90,13 @@ Resume Training?

 We all experience times when the power goes out, or we feel like rewarding ourself a little by playing Battlefield (does someone still remember Quake?). Since we are snapshotting intermediate results during training, we will be able to resume from snapshots. This can be done as easy as:

-    GLOG_logtostderr=1 examples/train_net.bin examples/imagenet_solver.prototxt caffe_imagenet_train_10000.solverstate
+    ./resume_training.sh

-where `caffe_imagenet_train_1000.solverstate` is the solver state snapshot that stores all necessary information to recover the exact solver state (including the parameters, momentum history, etc).
+where in the script `caffe_imagenet_train_10000.solverstate` is the solver state snapshot that stores all necessary information to recover the exact solver state (including the parameters, momentum history, etc).

 Parting Words
 -------------

-Hope you liked this recipe. Many researchers have gone further since the ILSVRC 2012 challenge, changing the network architecture and/or finetuning the various parameters in the network. The recent ILSVRC 2013 challenge suggests that there are quite some room for improvement. **Caffe allows one to explore different network choices  more easily, by simply writing different prototxt files** - isn't that exciting?
+Hope you liked this recipe! Many researchers have gone further since the ILSVRC 2012 challenge, changing the network architecture and/or finetuning the various parameters in the network. The recent ILSVRC 2013 challenge suggests that there is still quite some room for improvement. **Caffe allows one to explore different network choices more easily, by simply writing different prototxt files** - isn't that exciting?

-And since now you have a trained network, check out how to use it: [Running Pretrained ImageNet](imagenet_pretrained.html). This time we will use Python, but if you have wrappers for other languages, please kindly send me a pull request!
+And since now you have a trained network, check out how to use it: [Running Pretrained ImageNet](imagenet_pretrained.html). This time we will use Python, but if you have wrappers for other languages, please kindly send a pull request!
--- a/docs/index.md
+++ b/docs/index.md
@ -33,7 +33,7 @@ Quick Links
 * [Presentation](https://docs.google.com/presentation/d/1lzyXMRQFlOYE2Jy0lCNaqltpcCIKuRzKJxQ7vCuPRc8/edit?usp=sharing): Presentation on Caffe at the UC Berkeley Vision Group meeting.
 * [Installation](installation.html): Instructions on installing Caffe (tested on Ubuntu 12.04, but works on Red Hat, OS X, etc.).
 * [MNIST Demo](mnist.html): example of end-to-end training and testing on the MNIST data.
-* [Training ImageNet](imagenet.html): tutorial on end-to-end training of an ImageNet classifier.
+* [Training ImageNet](imagenet_training.html): tutorial on end-to-end training of an ImageNet classifier.
 * [Running Pretrained ImageNet](imagenet_pretrained.html): simply runs in Python!
 * [Running Detection](imagenet_detection.html): run a pretrained model as a detector.

--- a/docs/mnist.md
+++ b/docs/mnist.md
@ -13,8 +13,10 @@ Prepare Datasets

 You will first need to download and convert the data format from the MNIST website. To do this, simply run the following commands:

-    cd $CAFFE_ROOT/data
+    cd $CAFFE_ROOT/data/mnist
    ./get_mnist.sh
+    cd $CAFFE_ROOT/examples/lenet
+    ./create_mnist.sh

 If it complains that `wget` or `gunzip` are not installed, you need to install them respectively. After running the script there should be two datasets, `CAFFE_ROOT/data/mnist-train-leveldb`, and `CAFFE_ROOT/data/mnist-test-leveldb`.

@ -31,10 +33,10 @@ Training and Testing the Model

 Training the model is simple after you have written the network definition protobuf and solver protobuf files. Simply run `train_mnist.sh`, or the following command directly:

-    cd $CAFFE_ROOT/data
-    GLOG_logtostderr=1 ../examples/train_net.bin lenet_solver.prototxt
+    cd $CAFFE_ROOT/examples/lenet
+    ./train_lenet.sh

-A few explanations: `GLOG_logtostderr=1` is the google logging flag that prints all the logging messages directly to stderr. The main executable for training is `examples/train_net.bin`, with the solver protobuf text file as its argument.
+`train_lenet.sh` is a simple script, but here are a few explanations: `GLOG_logtostderr=1` is the google logging flag that prints all the logging messages directly to stderr. The main tool for training is `train_net.bin`, with the solver protobuf text file as its argument.

 When you run the code, you will see a lot of messages flying by like this:

@ -79,7 +81,7 @@ which you can deploy as a trained model in your application, if you are training
 Um... How about GPU training?
 -----------------------------

-You just did! All the training were carried out on the GPU. In fact, if you would like to do training on CPU, you can simply change one line in `lenet_solver.prototxt`:
+You just did! All the training was carried out on the GPU. In fact, if you would like to do training on CPU, you can simply change one line in `lenet_solver.prototxt`:

    # solver mode: 0 for CPU and 1 for GPU
    solver_mode: 0
--- a/docs/mnist_prototxt.md
+++ b/docs/mnist_prototxt.md
@ -6,7 +6,7 @@ title: Caffe
 Define the MNIST Network
 =========================

-This page explains the prototxt file used in the MNIST demo. We assume that you are familiar with [Google Protobuf](https://developers.google.com/protocol-buffers/docs/overview), and assume that you have read the protobuf definitions used by Caffe, which can be found at [src/caffe/proto/caffe.proto](https://github.com/Yangqing/caffe/blob/master/src/caffe/proto/caffe.proto).
+This page explains the prototxt file `lenet_train.prototxt` used in the MNIST demo. We assume that you are familiar with [Google Protobuf](https://developers.google.com/protocol-buffers/docs/overview), and assume that you have read the protobuf definitions used by Caffe, which can be found at [src/caffe/proto/caffe.proto](https://github.com/Yangqing/caffe/blob/master/src/caffe/proto/caffe.proto).

 Specifically, we will write a `caffe::NetParameter` (or in python, `caffe.proto.caffe_pb2.NetParameter`) protubuf. We will start by giving the network a name:

--- a/docs/mnist_solver_prototxt.md
+++ b/docs/mnist_solver_prototxt.md
@ -9,7 +9,7 @@ Define the MNIST Solver
 The page is under construction. For now, check out the comments in the solver prototxt file, which explains each line in the prototxt:

    # The training protocol buffer definition
-    train_net: "lenet.prototxt"
+    train_net: "lenet_train.prototxt"
    # The testing protocol buffer definition
    test_net: "lenet_test.prototxt"
    # test_iter specifies how many forward passes the test should carry out.
--- a/docs/selective_search_demo.ipynb
+++ b/docs/selective_search_demo.ipynb
--- a/examples/cifar/TODO.md
+++ b/examples/cifar/TODO.md
@ -0,0 +1,4 @@
+# CIFAR-10
+
+Contributing a CIFAR-10 example would be welcome! A benchmark against
+cuda-convnet could be interesting too.
--- a/examples/cifar/convert_cifar_data.cpp
+++ b/examples/cifar/convert_cifar_data.cpp
@ -1,11 +1,11 @@
 // Copyright Yangqing Jia 2013
 //
-// This script converts the MNIST dataset to the leveldb format used
+// This script converts the CIFAR dataset to the leveldb format used
 // by caffe to perform classification.
 // Usage:
-//    convert_mnist_data input_image_file input_label_file output_db_file
-// The MNIST dataset could be downloaded at
-//    http://yann.lecun.com/exdb/mnist/
+//    convert_cifar_data input_folder output_db_file
+// The CIFAR dataset could be downloaded at
+//    http://www.cs.toronto.edu/~kriz/cifar.html

 #include <google/protobuf/text_format.h>
 #include <glog/logging.h>
--- a/examples/imagenet/create_imagenet.sh
+++ b/examples/imagenet/create_imagenet.sh
@ -0,0 +1,20 @@
+#!/usr/bin/env sh
+# Create the imagenet leveldb inputs
+# N.B. set the path to the imagenet train + val data dirs
+# Writes ilsvrc12_train_leveldb (read by make_imagenet_mean.sh) and ilsvrc12_val_leveldb.
+TOOLS=../../build/tools
+DATA=../../data/ilsvrc12
+
+echo "Creating leveldb..."
+
+GLOG_logtostderr=1 $TOOLS/convert_imageset.bin \
+    /path/to/imagenet/train/ \
+    $DATA/train.txt \
+    ilsvrc12_train_leveldb 1
+
+GLOG_logtostderr=1 $TOOLS/convert_imageset.bin \
+    /path/to/imagenet/val/ \
+    $DATA/val.txt \
+    ilsvrc12_val_leveldb 1
+
+echo "Done."
--- a/examples/imagenet/imagenet_solver.prototxt
+++ b/examples/imagenet/imagenet_solver.prototxt
@ -1,5 +1,5 @@
-train_net: "examples/imagenet.prototxt"
-test_net: "examples/imagenet_val.prototxt"
+train_net: "imagenet_train.prototxt"
+test_net: "imagenet_val.prototxt"
 test_iter: 1000
 test_interval: 1000
 base_lr: 0.01
--- a/examples/imagenet/imagenet_train.prototxt
+++ b/examples/imagenet/imagenet_train.prototxt
@ -1,8 +1,17 @@
-input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 227
-input_dim: 227
+name: "CaffeNet"
+layers {
+  layer {
+    name: "data"
+    type: "data"
+    source: "ilsvrc12_train_leveldb"  # same leveldb make_imagenet_mean.sh reads
+    meanfile: "../../data/ilsvrc12/imagenet_mean.binaryproto"
+    batchsize: 256
+    cropsize: 227
+    mirror: true
+  }
+  top: "data"
+  top: "label"
+}
 layers {
  layer {
    name: "conv1"
@ -347,9 +356,9 @@ layers {
 }
 layers {
  layer {
-    name: "prob"
-    type: "softmax"
+    name: "loss"
+    type: "softmax_loss"
  }
  bottom: "fc8"
-  top: "prob"
+  bottom: "label"
 }
--- a/examples/imagenet/imagenet_val.prototxt
+++ b/examples/imagenet/imagenet_val.prototxt
@ -3,8 +3,8 @@ layers {
  layer {
    name: "data"
    type: "data"
-    source: "/home/jiayq/Data/ILSVRC12/val-leveldb"
-    meanfile: "/home/jiayq/Data/ILSVRC12/image_mean.binaryproto"
+    source: "ilsvrc12_val_leveldb"
+    meanfile: "../../data/ilsvrc12/imagenet_mean.binaryproto"
    batchsize: 50
    cropsize: 227
    mirror: false
--- a/examples/imagenet/make_imagenet_mean.sh
+++ b/examples/imagenet/make_imagenet_mean.sh
@ -0,0 +1,10 @@
+#!/usr/bin/env sh
+# Compute the mean image from the imagenet training leveldb
+# N.B. this is available in data/ilsvrc12
+
+# Plain assignments: a leading "$" would run "=..." as a command and leave these empty.
+TOOLS=../../build/tools
+DATA=../../data/ilsvrc12
+$TOOLS/compute_image_mean.bin ilsvrc12_train_leveldb $DATA/imagenet_mean.binaryproto
+
+echo "Done."
--- a/examples/imagenet/resume_training.sh
+++ b/examples/imagenet/resume_training.sh
@ -0,0 +1,8 @@
+#!/usr/bin/env sh
+# Resume imagenet training from the given solver state snapshot.
+TOOLS=../../build/tools
+
+GLOG_logtostderr=1 $TOOLS/train_net.bin \
+    imagenet_solver.prototxt caffe_imagenet_train_10000.solverstate
+
+echo "Done."
--- a/examples/imagenet/train_imagenet.sh
+++ b/examples/imagenet/train_imagenet.sh
@ -0,0 +1,7 @@
+#!/usr/bin/env sh
+# Train the imagenet model described by imagenet_solver.prototxt.
+TOOLS=../../build/tools
+
+GLOG_logtostderr=1 $TOOLS/train_net.bin imagenet_solver.prototxt
+
+echo "Done."
--- a/examples/lenet/convert_mnist_data.cpp
+++ b/examples/lenet/convert_mnist_data.cpp
--- a/examples/lenet/create_mnist.sh
+++ b/examples/lenet/create_mnist.sh
@ -0,0 +1,15 @@
+#!/usr/bin/env sh
+# This script converts the mnist data into leveldb format.
+# Both the image and the label files are read from $DATA.
+EXAMPLES=../../build/examples/lenet
+DATA=../../data/mnist
+
+echo "Creating leveldb..."
+
+rm -rf mnist-train-leveldb
+rm -rf mnist-test-leveldb
+
+$EXAMPLES/convert_mnist_data.bin $DATA/train-images-idx3-ubyte $DATA/train-labels-idx1-ubyte mnist-train-leveldb
+$EXAMPLES/convert_mnist_data.bin $DATA/t10k-images-idx3-ubyte $DATA/t10k-labels-idx1-ubyte mnist-test-leveldb
+
+echo "Done."
--- a/examples/lenet/lenet.prototxt
+++ b/examples/lenet/lenet.prototxt
@ -0,0 +1,117 @@
+name: "LeNet"
+input: "data"
+input_dim: 64
+input_dim: 1
+input_dim: 28
+input_dim: 28
+# N.B. input should be in [0, 1]: mnist raw data scaled by 0.00390625 (= 1/256)
+layers {
+  layer {
+    name: "conv1"
+    type: "conv"
+    num_output: 20
+    kernelsize: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+    blobs_lr: 1.
+    blobs_lr: 2.
+  }
+  bottom: "data"
+  top: "conv1"
+}
+layers {
+  layer {
+    name: "pool1"
+    type: "pool"
+    kernelsize: 2
+    stride: 2
+    pool: MAX
+  }
+  bottom: "conv1"
+  top: "pool1"
+}
+layers {
+  layer {
+    name: "conv2"
+    type: "conv"
+    num_output: 50
+    kernelsize: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+    blobs_lr: 1.
+    blobs_lr: 2.
+  }
+  bottom: "pool1"
+  top: "conv2"
+}
+layers {
+  layer {
+    name: "pool2"
+    type: "pool"
+    kernelsize: 2
+    stride: 2
+    pool: MAX
+  }
+  bottom: "conv2"
+  top: "pool2"
+}
+layers {
+  layer {
+    name: "ip1"
+    type: "innerproduct"
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+    blobs_lr: 1.
+    blobs_lr: 2.
+  }
+  bottom: "pool2"
+  top: "ip1"
+}
+layers {
+  layer {
+    name: "relu1"
+    type: "relu"
+  }
+  bottom: "ip1"
+  top: "ip1"
+}
+layers {
+  layer {
+    name: "ip2"
+    type: "innerproduct"
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+    blobs_lr: 1.
+    blobs_lr: 2.
+  }
+  bottom: "ip1"
+  top: "ip2"
+}
+layers {
+  layer {
+    name: "prob"
+    type: "softmax"
+  }
+  bottom: "ip2"
+  top: "prob"
+}
--- a/examples/lenet/lenet_solver.prototxt
+++ b/examples/lenet/lenet_solver.prototxt
@ -1,5 +1,5 @@
 # The training protocol buffer definition
-train_net: "lenet.prototxt"
+train_net: "lenet_train.prototxt"
 # The testing protocol buffer definition
 test_net: "lenet_test.prototxt"
 # test_iter specifies how many forward passes the test should carry out.
--- a/examples/lenet/lenet_test.prototxt
+++ b/examples/lenet/lenet_test.prototxt
--- a/examples/lenet/lenet_train.prototxt
+++ b/examples/lenet/lenet_train.prototxt
--- a/examples/lenet/train_lenet.sh
+++ b/examples/lenet/train_lenet.sh
@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+# Runs train_net.bin on lenet_solver.prototxt; all paths are relative to this directory.
+TOOLS=../../build/tools
+
+GLOG_logtostderr=1 $TOOLS/train_net.bin lenet_solver.prototxt
--- a/examples/selective_search_demo.ipynb
+++ b/examples/selective_search_demo.ipynb
@ -17,8 +17,7 @@
      "\n",
      "Let's run detection on an image of a couple of cats frolicking (one of the ImageNet detection challenge pictures), which we will download from the web. You'll need a prototxt specifying the network, and a trained model.\n",
      "\n",
-      "We will use `examples/imagenet_deploy.prototxt` and the [caffe_reference_imagenet_model](https://www.dropbox.com/s/n3jups0gr7uj0dv/caffe_reference_imagenet_model).\n",
-      "You'll need to download the model for yourself, and put it in `examples/caffe_reference_imagenet_model`."
+      "We will use `models/imagenet.prototxt` and the caffe_reference_imagenet_model which you can download by `models/get_caffe_reference_imagenet_model.sh`. The learned model should be at `models/caffe_reference_imagenet_model`."
     ]
    },
    {
@ -28,7 +27,7 @@
      "!mkdir _temp\n",
      "!curl http://farm1.static.flickr.com/220/512450093_7717fb8ce8.jpg > _temp/cat.jpg\n",
      "!echo `pwd`/_temp/cat.jpg > _temp/cat.txt\n",
-      "!python ../python/caffe/detection/detector.py --crop_mode=selective_search --pretrained_model=caffe_reference_imagenet_model --model_def=imagenet_deploy.prototxt _temp/cat.txt _temp/cat.h5"
+      "!python ../python/caffe/detection/detector.py --crop_mode=selective_search --pretrained_model=../models/caffe_reference_imagenet_model --model_def=../models/imagenet.prototxt _temp/cat.txt _temp/cat.h5"
     ],
     "language": "python",
     "metadata": {},
@ -329,14 +328,14 @@
      "Refer to `python detector.py --help` and the `images_dim` and `images_mean_file` parameters to describe your data set.\n",
      "No need for hardcoding.\n",
      "\n",
-      "Anyway, let's now load ImageNet class names and make a DataFrame of the features."
+      "Anyway, let's now load ImageNet class names and make a DataFrame of the features. Note you'll need the auxiliary ilsvrc2012 data fetched by `data/ilsvrc12/get_ilsvrc12_aux.sh`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
-      "with open('../python/caffe/imagenet/ilsvrc_2012_synset_words.txt') as f:\n",
+      "with open('../data/ilsvrc12/synset_words.txt') as f:\n",
      "    labels_df = pd.DataFrame([\n",
      "        {\n",
      "            'synset_id': l.strip().split(' ')[0],\n",
--- a/include/caffe/util/insert_splits.hpp
+++ b/include/caffe/util/insert_splits.hpp
@ -0,0 +1,29 @@
+// Copyright 2014 Jeff Donahue
+
+#ifndef CAFFE_UTIL_INSERT_SPLITS_HPP_
+#define CAFFE_UTIL_INSERT_SPLITS_HPP_
+
+#include <string>
+#include <utility>
+
+#include "caffe/proto/caffe.pb.h"
+
+// These using-declarations are kept because the declarations below (and the
+// matching .cpp) spell the types unqualified.
+using std::pair;
+using std::string;
+
+namespace caffe {
+
+// Copy NetParameters with SplitLayers added to replace any shared bottom
+// blobs with unique bottom blobs provided by the SplitLayer.
+//   param:       network definition to read (left unmodified).
+//   param_split: output; receives the copy with split layers inserted.
+void insert_splits(const NetParameter& param, NetParameter* param_split);
+
+// Fills *split_layer_connection with a layer of type "split" that takes
+// blob_name as its single bottom and exposes split_count tops.
+//   layer_name: name of the layer (or "input") that produces blob_name.
+//   blob_idx:   index of blob_name among that producer's tops.
+void configure_split_layer(const string& layer_name, const string& blob_name,
+    const int blob_idx, const int split_count,
+    LayerConnection* split_layer_connection);
+
+// Returns the name given to the split layer inserted after top blob_idx
+// (named blob_name) of layer layer_name.
+string get_split_layer_name(const string& layer_name, const string& blob_name,
+    const int blob_idx);
+
+// Returns the name of the split_idx-th top of that split layer. Index 0
+// reuses blob_name itself so the first copy can be computed in place.
+string get_split_blob_name(const string& layer_name, const string& blob_name,
+    const int blob_idx, const int split_idx);
+
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_INSERT_SPLITS_HPP_
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@ -44,6 +44,23 @@ class ReLULayer : public NeuronLayer<Dtype> {
      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
 };

+// TanHLayer: a NeuronLayer that applies the hyperbolic tangent to each
+// element of its bottom blob. It adds no state of its own; the constructor
+// only forwards the LayerParameter to NeuronLayer.
+template <typename Dtype>
+class TanHLayer : public NeuronLayer<Dtype> {
+ public:
+  explicit TanHLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+ protected:
+  // Forward passes: compute the top data from the bottom data (CPU / GPU).
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+
+  // Backward passes: when propagate_down is set, compute the bottom diff
+  // from the top diff (CPU / GPU). The Dtype return value is the layer's
+  // reported loss, which the implementations set to 0.
+  virtual Dtype Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  virtual Dtype Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+};

 template <typename Dtype>
 class SigmoidLayer : public NeuronLayer<Dtype> {
@ -108,6 +125,27 @@ class DropoutLayer : public NeuronLayer<Dtype> {
 };


+// SplitLayer: takes a single bottom blob and exposes it through one or more
+// top blobs, so several consumers can each own a distinct blob. In the
+// backward pass the diffs of all tops are summed into the bottom diff.
+template <typename Dtype>
+class SplitLayer : public Layer<Dtype> {
+ public:
+  explicit SplitLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  // Validates the blob counts and shapes every non-in-place top like the
+  // bottom blob.
+  virtual void SetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+
+ protected:
+  // Forward passes: copy the bottom data into each top (CPU / GPU).
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top);
+  // Backward passes: accumulate every top diff into the bottom diff
+  // (CPU / GPU); both return 0.
+  virtual Dtype Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  virtual Dtype Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom);
+  // Element count of the bottom blob, cached by SetUp.
+  int count_;
+};
+
+
 template <typename Dtype>
 class FlattenLayer : public Layer<Dtype> {
 public:
--- a/models/.gitignore
+++ b/models/.gitignore
--- a/models/get_caffe_reference_imagenet_model.sh
+++ b/models/get_caffe_reference_imagenet_model.sh
@ -0,0 +1,9 @@
+#!/usr/bin/env sh
+# This scripts downloads the caffe reference imagenet model
+# for ilsvrc image classification and deep feature extraction
+
+echo "Downloading..."
+
+wget -q https://www.dropbox.com/s/n3jups0gr7uj0dv/caffe_reference_imagenet_model
+
+echo "Done. Please check that the checksum = bf44bac4a59aa7792b296962fe483f2b."
--- a/examples/imagenet.prototxt
+++ b/examples/imagenet.prototxt
@ -1,17 +1,9 @@
 name: "CaffeNet"
-layers {
-  layer {
-    name: "data"
-    type: "data"
-    source: "/home/jiayq/Data/ILSVRC12/train-leveldb"
-    meanfile: "/home/jiayq/Data/ILSVRC12/image_mean.binaryproto"
-    batchsize: 256
-    cropsize: 227
-    mirror: true
-  }
-  top: "data"
-  top: "label"
-}
+input: "data"
+input_dim: 10
+input_dim: 3
+input_dim: 227
+input_dim: 227
 layers {
  layer {
    name: "conv1"
@ -356,9 +348,9 @@ layers {
 }
 layers {
  layer {
-    name: "loss"
-    type: "softmax_loss"
+    name: "prob"
+    type: "softmax"
  }
  bottom: "fc8"
-  bottom: "label"
+  top: "prob"
 }
--- a/python/caffe/imagenet/ilsvrc_2012_synset_words.txt
+++ b/python/caffe/imagenet/ilsvrc_2012_synset_words.txt
--- a/python/caffe/imagenet/ilsvrc_2012_synsets.txt
+++ b/python/caffe/imagenet/ilsvrc_2012_synsets.txt
--- a/python/caffe/imagenet/ilsvrc_2012_test.txt
+++ b/python/caffe/imagenet/ilsvrc_2012_test.txt
--- a/python/caffe/imagenet/ilsvrc_2012_train.txt
+++ b/python/caffe/imagenet/ilsvrc_2012_train.txt
--- a/python/caffe/imagenet/ilsvrc_2012_val.txt
+++ b/python/caffe/imagenet/ilsvrc_2012_val.txt
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@ -47,12 +47,16 @@ Layer<Dtype>* GetLayer(const LayerParameter& param) {
    return new PoolingLayer<Dtype>(param);
  } else if (type == "relu") {
    return new ReLULayer<Dtype>(param);
+  } else if (type == "tanh") {
+    return new TanHLayer<Dtype>(param);
  } else if (type == "sigmoid") {
    return new SigmoidLayer<Dtype>(param);
  } else if (type == "softmax") {
    return new SoftmaxLayer<Dtype>(param);
  } else if (type == "softmax_loss") {
    return new SoftmaxWithLossLayer<Dtype>(param);
+  } else if (type == "split") {
+    return new SplitLayer<Dtype>(param);
  } else if (type == "multinomial_logistic_loss") {
    return new MultinomialLogisticLossLayer<Dtype>(param);
  } else {
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@ -129,6 +129,7 @@ void DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
  leveldb::DB* db_temp;
  leveldb::Options options;
  options.create_if_missing = false;
+  options.max_open_files = 100;
  LOG(INFO) << "Opening leveldb " << this->layer_param_.source();
  leveldb::Status status = leveldb::DB::Open(
      options, this->layer_param_.source(), &db_temp);
--- a/src/caffe/layers/softmax_loss_layer.cu
+++ b/src/caffe/layers/softmax_loss_layer.cu
@ -15,7 +15,7 @@ namespace caffe {
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top) {
-  CHECK_EQ(bottom.size(), 2) << "SoftmaxLoss Layer takes a single blob as input.";
+  CHECK_EQ(bottom.size(), 2) << "SoftmaxLoss Layer takes two blobs as input.";
  CHECK_EQ(top->size(), 0) << "SoftmaxLoss Layer takes no blob as output.";
  softmax_bottom_vec_.clear();
  softmax_bottom_vec_.push_back(bottom[0]);
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@ -0,0 +1,101 @@
+// Copyright 2014 Jeff Donahue
+
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// Validates the blob counts (exactly one bottom, at least one top), caches
+// the bottom's element count in count_, and reshapes every top that is not
+// the bottom itself to the bottom's dimensions.
+//   bottom: input blobs; must contain exactly one blob.
+//   top:    output blobs; only (*top)[0] may alias bottom[0] (in-place).
+template <typename Dtype>
+void SplitLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  CHECK_EQ(bottom.size(), 1) << "Split Layer takes a single blob as input.";
+  CHECK_GE(top->size(), 1) << "Split Layer takes at least one blob as output.";
+  count_ = bottom[0]->count();
+  for (int i = 0; i < top->size(); ++i) {
+    // Allow the 0th top blob to be 'in-place', but no others.
+    if (i == 0 && (*top)[i] == bottom[0]) {
+      continue;
+    }
+    CHECK_NE((*top)[i], bottom[0]) << "Only 0th top blob may be in place.";
+    (*top)[i]->Reshape(bottom[0]->num(), bottom[0]->channels(),
+                       bottom[0]->height(), bottom[0]->width());
+    CHECK_EQ(count_, (*top)[i]->count());
+  }
+}
+
+// CPU forward pass: duplicates the bottom blob's data into every top blob.
+// A 0th top that aliases the bottom (in-place) already holds the data and is
+// left untouched.
+template <typename Dtype>
+void SplitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  const Dtype* source = bottom[0]->cpu_data();
+  for (int t = 0; t < top->size(); ++t) {
+    Blob<Dtype>* const target_blob = (*top)[t];
+    const bool in_place = (t == 0 && target_blob == bottom[0]);
+    if (!in_place) {
+      caffe_copy(count_, source, target_blob->mutable_cpu_data());
+    }
+  }
+}
+
+// GPU forward pass: duplicates the bottom blob's data into every top blob.
+// A 0th top that aliases the bottom (in-place) already holds the data and is
+// left untouched.
+template <typename Dtype>
+void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  const Dtype* source = bottom[0]->gpu_data();
+  for (int t = 0; t < top->size(); ++t) {
+    Blob<Dtype>* const target_blob = (*top)[t];
+    const bool in_place = (t == 0 && target_blob == bottom[0]);
+    if (!in_place) {
+      caffe_gpu_copy(count_, source, target_blob->mutable_gpu_data());
+    }
+  }
+}
+
+// CPU backward pass: when propagate_down is set, writes the sum of all top
+// diffs into the bottom diff. Always returns 0.
+template <typename Dtype>
+Dtype SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
+  if (!propagate_down) {
+    return Dtype(0.);
+  }
+  const Dtype* first_top_diff = top[0]->cpu_diff();
+  Dtype* accumulated_diff = (*bottom)[0]->mutable_cpu_diff();
+  // Seed the sum with the 0th top diff. When the 0th top is the bottom blob
+  // itself (in-place), the bottom diff already contains it.
+  const bool in_place = (top[0] == (*bottom)[0]);
+  if (!in_place) {
+    caffe_copy(count_, first_top_diff, accumulated_diff);
+  }
+  // Fold in the diffs of the remaining tops.
+  for (int t = 1; t < top.size(); ++t) {
+    caffe_axpy(count_, Dtype(1.), top[t]->cpu_diff(), accumulated_diff);
+  }
+  return Dtype(0.);
+}
+
+
+// GPU backward pass: when propagate_down is set, writes the sum of all top
+// diffs into the bottom diff. Always returns 0.
+template <typename Dtype>
+Dtype SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
+  if (!propagate_down) {
+    return Dtype(0.);
+  }
+  const Dtype* first_top_diff = top[0]->gpu_diff();
+  Dtype* accumulated_diff = (*bottom)[0]->mutable_gpu_diff();
+  // Seed the sum with the 0th top diff. When the 0th top is the bottom blob
+  // itself (in-place), the bottom diff already contains it.
+  const bool in_place = (top[0] == (*bottom)[0]);
+  if (!in_place) {
+    caffe_gpu_copy(count_, first_top_diff, accumulated_diff);
+  }
+  // Fold in the diffs of the remaining tops.
+  for (int t = 1; t < top.size(); ++t) {
+    caffe_gpu_axpy(count_, Dtype(1.), top[t]->gpu_diff(), accumulated_diff);
+  }
+  return Dtype(0.);
+}
+
+INSTANTIATE_CLASS(SplitLayer);
+
+}  // namespace caffe
--- a/src/caffe/layers/tanh_layer.cu
+++ b/src/caffe/layers/tanh_layer.cu
@ -0,0 +1,97 @@
+// Copyright 2014 Aravindh Mahendran
+// TanH neuron activation function layer. Adapted from ReLU layer code written by Yangqing Jia
+
+#include "caffe/layer.hpp"
+#include "caffe/vision_layers.hpp"
+#include <algorithm>
+
+namespace caffe {
+
+// CPU forward pass: top[i] = tanh(bottom[i]) for every element.
+//   bottom: input blobs; only bottom[0] is read.
+//   top:    output blobs; only (*top)[0] is written.
+template <typename Dtype>
+void TanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = (*top)[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  for (int i = 0; i < count; ++i) {
+    // Use tanh() rather than (e^{2x} - 1) / (e^{2x} + 1): for large x the
+    // exponential overflows to infinity and the quotient becomes NaN.
+    top_data[i] = tanh(bottom_data[i]);
+  }
+}
+
+// CPU backward pass: when propagate_down is set, computes
+//   bottom_diff[i] = top_diff[i] * (1 - tanh(bottom[i])^2).
+// Always returns 0.
+// NOTE(review): the derivative is evaluated from the bottom data, so if this
+// layer were run in place (top blob == bottom blob) the values read here
+// would be the forward outputs -- confirm in-place use is not expected.
+template <typename Dtype>
+Dtype TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const bool propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  if (propagate_down) {
+    const Dtype* bottom_data = (*bottom)[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
+    const int count = (*bottom)[0]->count();
+    for (int i = 0; i < count; ++i) {
+      // tanh() stays finite for large |x|; the exp(2x)-based quotient
+      // overflows to inf/inf = NaN and would poison the gradient.
+      const Dtype tanhx = tanh(bottom_data[i]);
+      bottom_diff[i] = top_diff[i] * (1 - tanhx*tanhx);
+    }
+  }
+  return Dtype(0);
+}
+
+// CUDA kernel: out[index] = tanh(in[index]) for index < n, one element per
+// thread. Threads whose index falls at or beyond n do nothing.
+template <typename Dtype>
+__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < n) {
+    // tanh() avoids the inf/inf = NaN produced by the exp(2x) formulation
+    // once exp(2x) overflows.
+    out[index] = tanh(in[index]);
+  }
+}
+
+// GPU forward pass: launches TanHForward over every element of bottom[0],
+// writing into (*top)[0], then checks the launch for CUDA errors.
+template <typename Dtype>
+void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    vector<Blob<Dtype>*>* top) {
+  const Dtype* input = bottom[0]->gpu_data();
+  Dtype* output = (*top)[0]->mutable_gpu_data();
+  const int n = bottom[0]->count();
+  TanHForward<Dtype><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, input, output);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// CUDA kernel: out_diff[index] = in_diff[index] * (1 - tanh(in_data[index])^2)
+// for index < n, one element per thread.
+//   in_diff:  gradient arriving from the layer above.
+//   in_data:  the values the forward pass was applied to.
+//   out_diff: gradient written for the layer below.
+template <typename Dtype>
+__global__ void TanHBackward(const int n, const Dtype* in_diff,
+    const Dtype* in_data, Dtype* out_diff) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < n) {
+    // tanh() stays finite for large |x|, whereas the exp(2x)-based quotient
+    // overflows to inf/inf = NaN.
+    Dtype tanhx = tanh(in_data[index]);
+    out_diff[index] = in_diff[index] * (1 - tanhx*tanhx);
+  }
+}
+
+// GPU backward pass: when propagate_down is set, launches TanHBackward to
+// turn the top diff into the bottom diff and checks the launch for CUDA
+// errors. Always returns 0.
+template <typename Dtype>
+Dtype TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const bool propagate_down,
+    vector<Blob<Dtype>*>* bottom) {
+  if (!propagate_down) {
+    return Dtype(0);
+  }
+  const Dtype* input = (*bottom)[0]->gpu_data();
+  const Dtype* output_grad = top[0]->gpu_diff();
+  Dtype* input_grad = (*bottom)[0]->mutable_gpu_diff();
+  const int n = (*bottom)[0]->count();
+  TanHBackward<Dtype><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+      n, output_grad, input, input_grad);
+  CUDA_POST_KERNEL_CHECK;
+  return Dtype(0);
+}
+
+INSTANTIATE_CLASS(TanHLayer);
+
+
+}  // namespace caffe
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@ -9,6 +9,7 @@
 #include "caffe/layer.hpp"
 #include "caffe/net.hpp"
 #include "caffe/util/io.hpp"
+#include "caffe/util/insert_splits.hpp"

 using std::pair;
 using std::map;
@ -29,7 +30,10 @@ Net<Dtype>::Net(const string& param_file) {
 }

 template <typename Dtype>
-void Net<Dtype>::Init(const NetParameter& param) {
+void Net<Dtype>::Init(const NetParameter& in_param) {
+  // Create a copy of in_param with splits added where necessary.
+  NetParameter param;
+  insert_splits(in_param, &param);
  // Basically, build all the layers and set up its connections.
  name_ = param.name();
  map<string, int> blob_name_to_idx;
--- a/src/caffe/test/test_split_layer.cpp
+++ b/src/caffe/test/test_split_layer.cpp
--- a/src/caffe/test/test_tanh_layer.cpp
+++ b/src/caffe/test/test_tanh_layer.cpp
@ -0,0 +1,102 @@
+// Copyright 2014 Aravindh Mahendran
+// Adapted from other test files 
+
+#include <cmath>
+#include <cstring>
+#include <cuda_runtime.h>
+
+#include "gtest/gtest.h"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+
+// Test fixture for TanHLayer: owns one 2x10x1x1 bottom blob filled by a
+// GaussianFiller built from a default FillerParameter, one empty top blob,
+// and the single-element blob vectors handed to the layer.
+template <typename Dtype>
+class TanHLayerTest : public ::testing::Test {
+ protected:
+  TanHLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 10, 1, 1)),
+        blob_top_(new Blob<Dtype>()) {
+    // Populate the input blob before exposing it through the vectors.
+    FillerParameter gaussian_param;
+    GaussianFiller<Dtype> gaussian(gaussian_param);
+    gaussian.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~TanHLayerTest() {
+    delete blob_bottom_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+typedef ::testing::Types<float, double> Dtypes;
+TYPED_TEST_CASE(TanHLayerTest, Dtypes);
+
+// Runs the CPU forward pass and checks that every output lies within 1e-4 of
+// (e^{2x} - 1) / (e^{2x} + 1) for the corresponding input x.
+TYPED_TEST(TanHLayerTest, TestForwardCPU) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::CPU);
+  TanHLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  Blob<TypeParam>* const in = this->blob_bottom_;
+  Blob<TypeParam>* const out = this->blob_top_;
+  for (int n = 0; n < in->num(); ++n) {
+    for (int c = 0; c < in->channels(); ++c) {
+      for (int h = 0; h < in->height(); ++h) {
+        for (int w = 0; w < in->width(); ++w) {
+          const TypeParam y = out->data_at(n, c, h, w);
+          const TypeParam x = in->data_at(n, c, h, w);
+          EXPECT_GE(y + 1e-4, (exp(2*x)-1)/(exp(2*x)+1));
+          EXPECT_LE(y - 1e-4, (exp(2*x)-1)/(exp(2*x)+1));
+        }
+      }
+    }
+  }
+}
+
+// Runs the exhaustive gradient check on TanHLayer in CPU mode, using a
+// GradientChecker constructed with (1e-2, 1e-3).
+TYPED_TEST(TanHLayerTest, TestGradientCPU) {
+  Caffe::set_mode(Caffe::CPU);
+  LayerParameter layer_param;
+  TanHLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(
+      layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+// Runs the GPU forward pass and checks that every output lies within 1e-4 of
+// (e^{2x} - 1) / (e^{2x} + 1) for the corresponding input x.
+TYPED_TEST(TanHLayerTest, TestForwardGPU) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::GPU);
+  TanHLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_));
+  Blob<TypeParam>* const in = this->blob_bottom_;
+  Blob<TypeParam>* const out = this->blob_top_;
+  for (int n = 0; n < in->num(); ++n) {
+    for (int c = 0; c < in->channels(); ++c) {
+      for (int h = 0; h < in->height(); ++h) {
+        for (int w = 0; w < in->width(); ++w) {
+          const TypeParam y = out->data_at(n, c, h, w);
+          const TypeParam x = in->data_at(n, c, h, w);
+          EXPECT_GE(y + 1e-4, (exp(2*x)-1)/(exp(2*x)+1));
+          EXPECT_LE(y - 1e-4, (exp(2*x)-1)/(exp(2*x)+1));
+        }
+      }
+    }
+  }
+}
+
+// Runs the exhaustive gradient check on TanHLayer in GPU mode, using a
+// GradientChecker constructed with (1e-2, 1e-3).
+TYPED_TEST(TanHLayerTest, TestGradientGPU) {
+  Caffe::set_mode(Caffe::GPU);
+  LayerParameter layer_param;
+  TanHLayer<TypeParam> layer(layer_param);
+  GradientChecker<TypeParam> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(
+      layer, this->blob_bottom_vec_, this->blob_top_vec_);
+}
+
+}
--- a/src/caffe/util/insert_splits.cpp
+++ b/src/caffe/util/insert_splits.cpp
@ -0,0 +1,129 @@
+// Copyright 2014 Jeff Donahue
+
+#include <map>
+#include <string>
+#include <sstream>
+
+#include "caffe/common.hpp"
+#include "caffe/util/insert_splits.hpp"
+
+using std::map;
+using std::ostringstream;
+using std::pair;
+using std::make_pair;
+
+namespace caffe {
+
+// Builds *param_split as a copy of param in which every blob that is
+// consumed as a bottom more than once is routed through an inserted "split"
+// layer, so that each consumer reads its own uniquely named blob.
+//   param:       network definition to read; not modified.
+//   param_split: output; overwritten with the rewritten definition.
+// Aborts via LOG(FATAL) when a layer names a bottom blob that is neither a
+// net input nor the top of an earlier layer.
+void insert_splits(const NetParameter& param, NetParameter* param_split) {
+  // Initialize by copying from the input NetParameter.
+  param_split->CopyFrom(param);
+  param_split->clear_layers();
+  // Blobs are identified by (layer index, top index); net inputs use layer
+  // index -1. Bottoms are identified by (layer index, bottom index).
+  map<string, pair<int, int> > blob_name_to_last_top_idx;
+  map<pair<int, int>, pair<int, int> > bottom_idx_to_source_top_idx;
+  map<pair<int, int>, int> top_idx_to_bottom_count;
+  map<pair<int, int>, int> top_idx_to_bottom_split_idx;
+  map<int, string> layer_idx_to_layer_name;
+  layer_idx_to_layer_name[-1] = "input";
+  // Determine the number of times each blob is used as an input (bottom) blob.
+  for (int i = 0; i < param.input_size(); ++i) {
+    const string& blob_name = param.input(i);
+    blob_name_to_last_top_idx[blob_name] = make_pair(-1, i);
+  }
+  for (int i = 0; i < param.layers_size(); ++i) {
+    const LayerConnection& layer_connection = param.layers(i);
+    layer_idx_to_layer_name[i] = layer_connection.layer().name();
+    for (int j = 0; j < layer_connection.bottom_size(); ++j) {
+      const string& blob_name = layer_connection.bottom(j);
+      if (blob_name_to_last_top_idx.find(blob_name) ==
+          blob_name_to_last_top_idx.end()) {
+        // Report the consuming layer itself; j is only the bottom index.
+        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer "
+            << layer_connection.layer().name() << " (layer " << i
+            << ", bottom " << j << ")";
+      }
+      const pair<int, int>& bottom_idx = make_pair(i, j);
+      const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
+      bottom_idx_to_source_top_idx[bottom_idx] = top_idx;
+      ++top_idx_to_bottom_count[top_idx];
+    }
+    // Register this layer's tops only after its bottoms were resolved, so a
+    // bottom with the same name as a top binds to the earlier producer.
+    for (int j = 0; j < layer_connection.top_size(); ++j) {
+      const string& blob_name = layer_connection.top(j);
+      blob_name_to_last_top_idx[blob_name] = make_pair(i, j);
+    }
+  }
+  // Create split layer for any input blobs used by other layers as bottom
+  // blobs more than once.
+  for (int i = 0; i < param.input_size(); ++i) {
+    const int split_count = top_idx_to_bottom_count[make_pair(-1, i)];
+    if (split_count > 1) {
+      const string& layer_name = layer_idx_to_layer_name[-1];
+      const string& blob_name = param.input(i);
+      LayerConnection* split_layer_connection = param_split->add_layers();
+      configure_split_layer(layer_name, blob_name, i, split_count,
+          split_layer_connection);
+    }
+  }
+  for (int i = 0; i < param.layers_size(); ++i) {
+    LayerConnection* layer_connection = param_split->add_layers();
+    layer_connection->CopyFrom(param.layers(i));
+    // Replace any shared bottom blobs with split layer outputs.
+    for (int j = 0; j < layer_connection->bottom_size(); ++j) {
+      const pair<int, int>& top_idx =
+          bottom_idx_to_source_top_idx[make_pair(i, j)];
+      const int split_count = top_idx_to_bottom_count[top_idx];
+      if (split_count > 1) {
+        const string& layer_name = layer_idx_to_layer_name[top_idx.first];
+        const string& blob_name = layer_connection->bottom(j);
+        // The post-increment hands each consumer the next unused split top.
+        layer_connection->set_bottom(j, get_split_blob_name(layer_name,
+            blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
+      }
+    }
+    // Create split layer for any top blobs used by other layers as bottom
+    // blobs more than once.
+    for (int j = 0; j < layer_connection->top_size(); ++j) {
+      const int split_count = top_idx_to_bottom_count[make_pair(i, j)];
+      if (split_count > 1) {
+        const string& layer_name = layer_idx_to_layer_name[i];
+        const string& blob_name = layer_connection->top(j);
+        LayerConnection* split_layer_connection = param_split->add_layers();
+        configure_split_layer(layer_name, blob_name, j, split_count,
+            split_layer_connection);
+      }
+    }
+  }
+}
+
+// Resets *split_layer_connection and fills it in as a "split" layer whose
+// single bottom is blob_name and whose split_count tops are named by
+// get_split_blob_name().
+void configure_split_layer(const string& layer_name, const string& blob_name,
+    const int blob_idx, const int split_count,
+    LayerConnection* split_layer_connection) {
+  split_layer_connection->Clear();
+  split_layer_connection->add_bottom(blob_name);
+  LayerParameter* const split_param = split_layer_connection->mutable_layer();
+  const string split_name =
+      get_split_layer_name(layer_name, blob_name, blob_idx);
+  split_param->set_name(split_name);
+  split_param->set_type("split");
+  int split_idx = 0;
+  while (split_idx < split_count) {
+    const string top_name =
+        get_split_blob_name(layer_name, blob_name, blob_idx, split_idx);
+    split_layer_connection->add_top(top_name);
+    ++split_idx;
+  }
+}
+
+// Returns "<blob_name>_<layer_name>_<blob_idx>_split", the name used for the
+// split layer inserted after the given top blob.
+string get_split_layer_name(const string& layer_name, const string& blob_name,
+    const int blob_idx) {
+  ostringstream name_builder;
+  name_builder << blob_name << "_";
+  name_builder << layer_name << "_";
+  name_builder << blob_idx << "_split";
+  return name_builder.str();
+}
+
+// Returns the name of the split_idx-th top blob of a split layer.
+// Index 0 keeps the original blob_name, so that copy is computed 'in-place'
+// (saving a bit of time and memory); any other index yields
+// "<blob_name>_<layer_name>_<blob_idx>_split_<split_idx>".
+string get_split_blob_name(const string& layer_name, const string& blob_name,
+    const int blob_idx, const int split_idx) {
+  if (split_idx != 0) {
+    ostringstream name_builder;
+    name_builder << blob_name << "_" << layer_name << "_" << blob_idx;
+    name_builder << "_split_" << split_idx;
+    return name_builder.str();
+  }
+  return blob_name;
+}
+
+}  // namespace caffe
--- a/examples/demo_compute_image_mean.cpp
+++ b/examples/demo_compute_image_mean.cpp
@ -14,7 +14,7 @@ using caffe::BlobProto;
 int main(int argc, char** argv) {
  ::google::InitGoogleLogging(argv[0]);
  if (argc != 3) {
-    LOG(ERROR) << "Usage: demo_compute_image_mean input_leveldb output_file";
+    LOG(ERROR) << "Usage: compute_image_mean input_leveldb output_file";
    return(0);
  }

--- a/examples/convert_imageset.cpp
+++ b/examples/convert_imageset.cpp
--- a/examples/device_query.cpp
+++ b/examples/device_query.cpp
--- a/examples/dump_network.cpp
+++ b/examples/dump_network.cpp
--- a/tools/extra/extract_seconds.py
+++ b/tools/extra/extract_seconds.py
--- a/tools/extra/launch_resize_and_crop_images.sh
+++ b/tools/extra/launch_resize_and_crop_images.sh
--- a/tools/extra/parselog.sh
+++ b/tools/extra/parselog.sh
--- a/tools/extra/plot_log.gnuplot.example
+++ b/tools/extra/plot_log.gnuplot.example
--- a/tools/extra/plot_training_log.py.example
+++ b/tools/extra/plot_training_log.py.example
--- a/tools/extra/resize_and_crop_images.py
+++ b/tools/extra/resize_and_crop_images.py
--- a/examples/finetune_net.cpp
+++ b/examples/finetune_net.cpp
--- a/examples/net_speed_benchmark.cpp
+++ b/examples/net_speed_benchmark.cpp
--- a/examples/test_net.cpp
+++ b/examples/test_net.cpp
--- a/examples/train_net.cpp
+++ b/examples/train_net.cpp