transpose parameter added to IP layer to support tied weights in an autoencoder. Arguments to the matrix multiplication calls are conditioned on this parameter; no actual transposing of the weight blob takes place.

test ip gradient computation with transpose on
Youssef Kashef 2016-01-29 19:21:48 +01:00
Parent eac9dd8145
Commit 8f847fa8fa
5 changed files with 304 additions and 15 deletions
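For orientation, the algebra that the new flag toggles can be summarized as follows (notation is mine, not part of the commit): let X be the M_ x K_ bottom blob, Y the M_ x N_ top blob, and W the layer's single weight blob.

\begin{aligned}
\text{transpose} = \text{false}:\quad & W \in \mathbb{R}^{N\_ \times K\_}, \qquad Y = X\,W^{\top} \\
\text{transpose} = \text{true}:\quad  & W \in \mathbb{R}^{K\_ \times N\_}, \qquad Y = X\,W
\end{aligned}

In both cases the stored blob is never reshuffled in memory; the GEMM calls are simply given different transpose flags and argument orders, as the hunks below show.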

View file

@@ -44,6 +44,7 @@ class InnerProductLayer : public Layer<Dtype> {
int N_;
bool bias_term_;
Blob<Dtype> bias_multiplier_;
bool transpose_; ///< if true, assume transposed weights
};
} // namespace caffe

View file

@@ -11,6 +11,7 @@ void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const int num_output = this->layer_param_.inner_product_param().num_output();
bias_term_ = this->layer_param_.inner_product_param().bias_term();
transpose_ = this->layer_param_.inner_product_param().transpose();
N_ = num_output;
const int axis = bottom[0]->CanonicalAxisIndex(
this->layer_param_.inner_product_param().axis());
@@ -27,10 +28,15 @@ void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
} else {
this->blobs_.resize(1);
}
-// Intialize the weight
+// Initialize the weights
vector<int> weight_shape(2);
-weight_shape[0] = N_;
-weight_shape[1] = K_;
+if (transpose_) {
+weight_shape[0] = K_;
+weight_shape[1] = N_;
+} else {
+weight_shape[0] = N_;
+weight_shape[1] = K_;
+}
this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
// fill the weights
shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
@@ -80,7 +86,8 @@ void InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
const Dtype* weight = this->blobs_[0]->cpu_data();
-caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
+caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
+M_, N_, K_, (Dtype)1.,
bottom_data, weight, (Dtype)0., top_data);
if (bias_term_) {
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
@@ -97,8 +104,17 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* bottom_data = bottom[0]->cpu_data();
// Gradient with respect to weight
-caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
+if (transpose_) {
+caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
+K_, N_, M_,
+(Dtype)1., bottom_data, top_diff,
+(Dtype)1., this->blobs_[0]->mutable_cpu_diff());
+} else {
+caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
+N_, K_, M_,
+(Dtype)1., top_diff, bottom_data,
+(Dtype)1., this->blobs_[0]->mutable_cpu_diff());
+}
}
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->cpu_diff();
@@ -110,9 +126,17 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
if (propagate_down[0]) {
const Dtype* top_diff = top[0]->cpu_diff();
// Gradient with respect to bottom data
-caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-top_diff, this->blobs_[0]->cpu_data(), (Dtype)0.,
-bottom[0]->mutable_cpu_diff());
+if (transpose_) {
+caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
+M_, K_, N_,
+(Dtype)1., top_diff, this->blobs_[0]->cpu_data(),
+(Dtype)0., bottom[0]->mutable_cpu_diff());
+} else {
+caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
+M_, K_, N_,
+(Dtype)1., top_diff, this->blobs_[0]->cpu_data(),
+(Dtype)0., bottom[0]->mutable_cpu_diff());
+}
}
}
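The two branches in Backward_cpu above compute the same gradients, just laid out to match whichever weight shape was chosen (again, notation mine; dY denotes the top diff and X the bottom data):

\begin{aligned}
\text{transpose} = \text{false}:\quad & \frac{\partial L}{\partial W} = (dY)^{\top} X \in \mathbb{R}^{N\_ \times K\_}, \qquad \frac{\partial L}{\partial X} = dY\,W \\
\text{transpose} = \text{true}:\quad  & \frac{\partial L}{\partial W} = X^{\top}\,dY \in \mathbb{R}^{K\_ \times N\_}, \qquad \frac{\partial L}{\partial X} = dY\,W^{\top}
\end{aligned}

This is why the transposed weight-gradient GEMM swaps both its output dimensions (K_, N_ instead of N_, K_) and the order of its inputs (bottom_data, top_diff instead of top_diff, bottom_data), while the bottom-gradient GEMM only flips the transpose flag on the weight operand.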

View file

@@ -19,7 +19,9 @@ void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
caffe_gpu_axpy<Dtype>(N_, bias_multiplier_.cpu_data()[0],
this->blobs_[1]->gpu_data(), top_data);
} else {
-caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
+caffe_gpu_gemm<Dtype>(CblasNoTrans,
+transpose_ ? CblasNoTrans : CblasTrans,
+M_, N_, K_, (Dtype)1.,
bottom_data, weight, (Dtype)0., top_data);
if (bias_term_)
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
@@ -36,8 +38,17 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* top_diff = top[0]->gpu_diff();
const Dtype* bottom_data = bottom[0]->gpu_data();
// Gradient with respect to weight
-caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
+if (transpose_) {
+caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
+K_, N_, M_,
+(Dtype)1., bottom_data, top_diff,
+(Dtype)1., this->blobs_[0]->mutable_gpu_diff());
+} else {
+caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
+N_, K_, M_,
+(Dtype)1., top_diff, bottom_data,
+(Dtype)1., this->blobs_[0]->mutable_gpu_diff());
+}
}
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->gpu_diff();
@@ -49,9 +60,17 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (propagate_down[0]) {
const Dtype* top_diff = top[0]->gpu_diff();
// Gradient with respect to bottom data
-caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-top_diff, this->blobs_[0]->gpu_data(), (Dtype)0.,
-bottom[0]->mutable_gpu_diff());
+if (transpose_) {
+caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
+M_, K_, N_,
+(Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
+(Dtype)0., bottom[0]->mutable_gpu_diff());
+} else {
+caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
+M_, K_, N_,
+(Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
+(Dtype)0., bottom[0]->mutable_gpu_diff());
+}
}
}

View file

@@ -786,6 +786,11 @@ message InnerProductParameter {
// all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 5 [default = 1];
// Specify whether to transpose the weight matrix or not.
// If transpose == true, any operations will be performed on the transpose
// of the weight matrix. The weight matrix itself is not going to be transposed;
// rather, the transpose flag of the operations will be toggled accordingly.
optional bool transpose = 6 [default = false];
}
// Message that stores parameters used by LogLayer
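For context, this parameter is what makes tied weights expressible in a net definition. Below is a minimal sketch of an encoder/decoder pair sharing one weight blob; the layer names, blob names, param name, and the 784-input/64-code sizes are illustrative assumptions, not taken from the commit:

# sketch only: names and sizes are illustrative
layer {
  name: "encode"
  type: "InnerProduct"
  bottom: "data"
  top: "code"
  # weight blob, shared across the two layers by name
  param { name: "tied_w" lr_mult: 1 }
  # encoder bias, not shared
  param { lr_mult: 2 }
  inner_product_param {
    num_output: 64        # weight blob shape: N_ x K_ = 64 x 784
    transpose: false
  }
}
layer {
  name: "decode"
  type: "InnerProduct"
  bottom: "code"
  top: "reconstruction"
  # same 64 x 784 blob as above
  param { name: "tied_w" lr_mult: 1 }
  # decoder bias, not shared
  param { lr_mult: 2 }
  inner_product_param {
    num_output: 784       # with transpose: true the expected shape is K_ x N_ = 64 x 784
    transpose: true
  }
}

Because the encoder stores its weights as 64 x 784 and the decoder with transpose: true also expects a 64 x 784 blob, the two layers can share the same parameter by name, and the decoder effectively applies W^T without ever copying or reshuffling it.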

View file

@@ -60,6 +60,50 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) {
EXPECT_EQ(this->blob_top_->channels(), 10);
}
/** @brief TestSetUp while toggling transpose flag
*/
TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
LayerParameter layer_param;
InnerProductParameter* inner_product_param =
layer_param.mutable_inner_product_param();
inner_product_param->set_num_output(10);
inner_product_param->set_transpose(false);
shared_ptr<InnerProductLayer<Dtype> > layer(
new InnerProductLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(2, this->blob_top_->num());
EXPECT_EQ(1, this->blob_top_->height());
EXPECT_EQ(1, this->blob_top_->width());
EXPECT_EQ(10, this->blob_top_->channels());
EXPECT_EQ(2, layer->blobs()[0]->num_axes());
EXPECT_EQ(10, layer->blobs()[0]->shape(0));
EXPECT_EQ(60, layer->blobs()[0]->shape(1));
}
/** @brief TestSetUp while toggling transpose flag
*/
TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
LayerParameter layer_param;
InnerProductParameter* inner_product_param =
layer_param.mutable_inner_product_param();
inner_product_param->set_num_output(10);
inner_product_param->set_transpose(true);
shared_ptr<InnerProductLayer<Dtype> > layer(
new InnerProductLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(2, this->blob_top_->num());
EXPECT_EQ(1, this->blob_top_->height());
EXPECT_EQ(1, this->blob_top_->width());
EXPECT_EQ(10, this->blob_top_->channels());
EXPECT_EQ(2, layer->blobs()[0]->num_axes());
EXPECT_EQ(60, layer->blobs()[0]->shape(0));
EXPECT_EQ(10, layer->blobs()[0]->shape(1));
}
TYPED_TEST(InnerProductLayerTest, TestForward) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
@@ -91,6 +135,79 @@ TYPED_TEST(InnerProductLayerTest, TestForward) {
}
}
/**
* @brief Initialize an IP layer without transpose and with random weights,
* run Forward, and save the result.
* Then initialize another IP layer with transpose,
* manually copy and transpose the weights from the first IP layer,
* run Forward on the same input, and check that the result is the same.
*/
TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
bool IS_VALID_CUDA = false;
#ifndef CPU_ONLY
IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
#endif
if (Caffe::mode() == Caffe::CPU ||
sizeof(Dtype) == 4 || IS_VALID_CUDA) {
LayerParameter layer_param;
InnerProductParameter* inner_product_param =
layer_param.mutable_inner_product_param();
inner_product_param->set_num_output(10);
inner_product_param->mutable_weight_filler()->set_type("uniform");
inner_product_param->mutable_bias_filler()->set_type("uniform");
inner_product_param->mutable_bias_filler()->set_min(1);
inner_product_param->mutable_bias_filler()->set_max(2);
inner_product_param->set_transpose(false);
shared_ptr<InnerProductLayer<Dtype> > layer(
new InnerProductLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
const int count = this->blob_top_->count();
Blob<Dtype>* const top = new Blob<Dtype>();
top->ReshapeLike(*this->blob_top_);
caffe_copy(count, this->blob_top_->cpu_data(), top->mutable_cpu_data());
this->blob_top_vec_.clear();
this->blob_top_vec_.push_back(new Blob<Dtype>());
inner_product_param->set_transpose(true);
shared_ptr<InnerProductLayer<Dtype> > ip_t(
new InnerProductLayer<Dtype>(layer_param));
ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
const int count_w = layer->blobs()[0]->count();
EXPECT_EQ(count_w, ip_t->blobs()[0]->count());
// manually copy and transpose the weights from 1st IP layer into 2nd
const Dtype* w = layer->blobs()[0]->cpu_data();
Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data();
const int width = layer->blobs()[0]->shape(1);
const int width_t = ip_t->blobs()[0]->shape(1);
for (int i = 0; i < count_w; ++i) {
int r = i / width;
int c = i % width;
w_t[c*width_t+r] = w[r*width+c]; // copy while transposing
}
// copy bias from 1st IP layer to 2nd IP layer
ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count());
caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(),
ip_t->blobs()[1]->mutable_cpu_data());
ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
EXPECT_EQ(count, this->blob_top_->count())
<< "Invalid count for top blob for IP with transpose.";
Blob<Dtype>* const top_t = new Blob<Dtype>();
top_t->ReshapeLike(*this->blob_top_vec_[0]);
caffe_copy(count,
this->blob_top_vec_[0]->cpu_data(),
top_t->mutable_cpu_data());
const Dtype* data = top->cpu_data();
const Dtype* data_t = top_t->cpu_data();
for (int i = 0; i < count; ++i) {
EXPECT_FLOAT_EQ(data[i], data_t[i]);
}
} else {
LOG(ERROR) << "Skipping test due to old architecture.";
}
}
TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_);
@@ -148,4 +265,127 @@ TYPED_TEST(InnerProductLayerTest, TestGradient) {
}
}
TYPED_TEST(InnerProductLayerTest, TestGradientTranspose) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
bool IS_VALID_CUDA = false;
#ifndef CPU_ONLY
IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
#endif
if (Caffe::mode() == Caffe::CPU ||
sizeof(Dtype) == 4 || IS_VALID_CUDA) {
LayerParameter layer_param;
InnerProductParameter* inner_product_param =
layer_param.mutable_inner_product_param();
inner_product_param->set_num_output(11);
inner_product_param->mutable_weight_filler()->set_type("gaussian");
inner_product_param->mutable_bias_filler()->set_type("gaussian");
inner_product_param->mutable_bias_filler()->set_min(1);
inner_product_param->mutable_bias_filler()->set_max(2);
inner_product_param->set_transpose(true);
InnerProductLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-3);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_);
} else {
LOG(ERROR) << "Skipping test due to old architecture.";
}
}
TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) {
typedef typename TypeParam::Dtype Dtype;
this->blob_bottom_vec_.push_back(this->blob_bottom_);
bool IS_VALID_CUDA = false;
#ifndef CPU_ONLY
IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
#endif
if (Caffe::mode() == Caffe::CPU ||
sizeof(Dtype) == 4 || IS_VALID_CUDA) {
LayerParameter layer_param;
InnerProductParameter* inner_product_param =
layer_param.mutable_inner_product_param();
inner_product_param->set_num_output(10);
inner_product_param->mutable_weight_filler()->set_type("uniform");
inner_product_param->mutable_bias_filler()->set_type("uniform");
inner_product_param->mutable_bias_filler()->set_min(1);
inner_product_param->mutable_bias_filler()->set_max(2);
inner_product_param->set_transpose(false);
shared_ptr<InnerProductLayer<Dtype> > layer(
new InnerProductLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
// copy top blob
Blob<Dtype>* const top = new Blob<Dtype>();
top->CopyFrom(*this->blob_top_, false, true);
// fake top diff
Blob<Dtype>* const diff = new Blob<Dtype>();
diff->ReshapeLike(*this->blob_top_);
{
FillerParameter filler_param;
UniformFiller<Dtype> filler(filler_param);
filler.Fill(diff);
}
caffe_copy(this->blob_top_vec_[0]->count(),
diff->cpu_data(),
this->blob_top_vec_[0]->mutable_cpu_diff());
vector<bool> propagate_down(1, true);
layer->Backward(this->blob_top_vec_,
propagate_down,
this->blob_bottom_vec_);
// copy first ip's weights and their diffs
Blob<Dtype>* const w = new Blob<Dtype>();
w->CopyFrom(*layer->blobs()[0], false, true);
w->CopyFrom(*layer->blobs()[0], true, true);
// copy bottom diffs
Blob<Dtype>* const bottom_diff = new Blob<Dtype>();
bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true);
// repeat original top with transposed ip
this->blob_top_vec_.clear();
this->blob_top_vec_.push_back(new Blob<Dtype>());
inner_product_param->set_transpose(true);
shared_ptr<InnerProductLayer<Dtype> > ip_t(
new InnerProductLayer<Dtype>(layer_param));
ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
// manually copy and transpose the weights from 1st IP layer into 2nd
{
const Dtype* w_src = w->cpu_data();
Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data();
const int width = layer->blobs()[0]->shape(1);
const int width_t = ip_t->blobs()[0]->shape(1);
for (int i = 0; i < layer->blobs()[0]->count(); ++i) {
int r = i / width;
int c = i % width;
w_t[c*width_t+r] = w_src[r*width+c]; // copy while transposing
}
// copy bias from 1st IP layer to 2nd IP layer
ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count());
caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(),
ip_t->blobs()[1]->mutable_cpu_data());
}
ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
caffe_copy(this->blob_top_vec_[0]->count(),
diff->cpu_data(),
this->blob_top_vec_[0]->mutable_cpu_diff());
ip_t->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
const Dtype* data = w->cpu_diff();
const Dtype* data_t = ip_t->blobs()[0]->cpu_diff();
const int WIDTH = layer->blobs()[0]->shape(1);
const int WIDTH_T = ip_t->blobs()[0]->shape(1);
for (int i = 0; i < layer->blobs()[0]->count(); ++i) {
int r = i / WIDTH;
int c = i % WIDTH;
EXPECT_NE(Dtype(0.), data[r*WIDTH+c]);
EXPECT_FLOAT_EQ(data[r*WIDTH+c], data_t[c*WIDTH_T+r]);
}
data = bottom_diff->cpu_diff();
data_t = this->blob_bottom_vec_[0]->cpu_diff();
for (int i = 0; i < this->blob_bottom_vec_[0]->count(); ++i) {
EXPECT_NE(Dtype(0.), data[i]);
EXPECT_FLOAT_EQ(data[i], data_t[i]);
}
} else {
LOG(ERROR) << "Skipping test due to old architecture.";
}
}
} // namespace caffe