From b17ac6620b4e6ae33d4d889b6cdbde1c447bb944 Mon Sep 17 00:00:00 2001 From: Eric Tzeng Date: Wed, 26 Feb 2014 18:45:45 -0800 Subject: [PATCH] Splitting source files between CUDA and CPU code. --- src/caffe/layers/bnll_layer.cpp | 48 ++++++++++ src/caffe/layers/bnll_layer.cu | 31 ------ src/caffe/layers/conv_layer.cpp | 88 ----------------- src/caffe/layers/conv_layer.cu | 104 +++++++++++++++++++++ src/caffe/layers/data_layer.cpp | 23 ----- src/caffe/layers/data_layer.cu | 44 +++++++++ src/caffe/layers/dropout_layer.cpp | 63 +++++++++++++ src/caffe/layers/dropout_layer.cu | 48 ---------- src/caffe/layers/flatten_layer.cpp | 18 ---- src/caffe/layers/flatten_layer.cu | 30 ++++++ src/caffe/layers/hdf5_data_layer.cpp | 28 ------ src/caffe/layers/hdf5_data_layer.cu | 53 +++++++++++ src/caffe/layers/im2col_layer.cpp | 24 ----- src/caffe/layers/im2col_layer.cu | 38 ++++++++ src/caffe/layers/inner_product_layer.cpp | 40 -------- src/caffe/layers/inner_product_layer.cu | 59 ++++++++++++ src/caffe/layers/{loss_layer.cu => loss_layer.cpp} | 6 +- src/caffe/layers/relu_layer.cpp | 42 +++++++++ src/caffe/layers/relu_layer.cu | 27 ------ src/caffe/layers/sigmoid_layer.cpp | 46 +++++++++ src/caffe/layers/sigmoid_layer.cu | 35 ------- src/caffe/layers/softmax_layer.cpp | 86 +++++++++++++++++ src/caffe/layers/softmax_layer.cu | 71 -------------- src/caffe/layers/softmax_loss_layer.cpp | 59 ++++++++++++ src/caffe/layers/softmax_loss_layer.cu | 40 -------- src/caffe/layers/split_layer.cpp | 34 ------- src/caffe/layers/split_layer.cu | 48 ++++++++++ src/caffe/layers/tanh_layer.cpp | 48 ++++++++++ src/caffe/layers/tanh_layer.cu | 33 ------- 29 files changed, 771 insertions(+), 543 deletions(-) create mode 100644 src/caffe/layers/bnll_layer.cpp create mode 100644 src/caffe/layers/conv_layer.cu create mode 100644 src/caffe/layers/data_layer.cu create mode 100644 src/caffe/layers/dropout_layer.cpp create mode 100644 src/caffe/layers/flatten_layer.cu create mode 100644 src/caffe/layers/hdf5_data_layer.cu create mode 100644 src/caffe/layers/im2col_layer.cu create mode 100644 src/caffe/layers/inner_product_layer.cu rename src/caffe/layers/{loss_layer.cu => loss_layer.cpp} (96%) create mode 100644 src/caffe/layers/relu_layer.cpp create mode 100644 src/caffe/layers/sigmoid_layer.cpp create mode 100644 src/caffe/layers/softmax_layer.cpp create mode 100644 src/caffe/layers/softmax_loss_layer.cpp create mode 100644 src/caffe/layers/split_layer.cu create mode 100644 src/caffe/layers/tanh_layer.cpp diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp new file mode 100644 index 0000000..ab0e0f0 --- /dev/null +++ b/src/caffe/layers/bnll_layer.cpp @@ -0,0 +1,48 @@ +// Copyright 2013 Yangqing Jia + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include + +using std::min; + +namespace caffe { + +const float kBNLL_THRESHOLD = 50.; + +template +void BNLLLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. 
+ exp(bottom_data[i])); + } +} + +template +Dtype BNLLLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + if (propagate_down) { + const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const int count = (*bottom)[0]->count(); + Dtype expval; + for (int i = 0; i < count; ++i) { + expval = exp(min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); + bottom_diff[i] = top_diff[i] * expval / (expval + 1.); + } + } + return Dtype(0); +} + + +INSTANTIATE_CLASS(BNLLLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu index f61cffa..1edec33 100644 --- a/src/caffe/layers/bnll_layer.cu +++ b/src/caffe/layers/bnll_layer.cu @@ -13,37 +13,6 @@ namespace caffe { const float kBNLL_THRESHOLD = 50.; template -void BNLLLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] > 0 ? - bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); - } -} - -template -Dtype BNLLLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - if (propagate_down) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const int count = (*bottom)[0]->count(); - Dtype expval; - for (int i = 0; i < count; ++i) { - expval = exp(min(bottom_data[i], Dtype(kBNLL_THRESHOLD))); - bottom_diff[i] = top_diff[i] * expval / (expval + 1.); - } - } - return Dtype(0); -} - -template __global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < n) { diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 21256f9..64a652a 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -107,36 +107,6 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } template -void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - for (int n = 0; n < NUM_; ++n) { - // First, im2col - im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, - WIDTH_, KSIZE_, PAD_, STRIDE_, col_data); - // Second, innerproduct with groups - for (int g = 0; g < GROUP_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, - (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, - (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); - } - // third, add bias - if (biasterm_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, NUM_OUTPUT_, - N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)1., top_data + (*top)[0]->offset(n)); - } - } -} - -template Dtype ConvolutionLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -192,64 +162,6 @@ Dtype ConvolutionLayer::Backward_cpu(const vector*>& top, return 
Dtype(0.); } -template -Dtype ConvolutionLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - Dtype* col_data = col_buffer_.mutable_gpu_data(); - Dtype* col_diff = col_buffer_.mutable_gpu_diff(); - // bias gradient if necessary - Dtype* bias_diff = NULL; - - if (biasterm_) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - CUDA_CHECK(cudaMemset(bias_diff, 0, - sizeof(Dtype) * this->blobs_[1]->count())); - for (int n = 0; n < NUM_; ++n) { - caffe_gpu_gemv(CblasNoTrans, NUM_OUTPUT_, N_, - 1., top_diff + top[0]->offset(n), - reinterpret_cast(bias_multiplier_->gpu_data()), - 1., bias_diff); - } - } - - int weight_offset = M_ * K_; - int col_offset = K_ * N_; - int top_offset = M_ * N_; - CUDA_CHECK(cudaMemset(weight_diff, 0, - sizeof(Dtype) * this->blobs_[0]->count())); - for (int n = 0; n < NUM_; ++n) { - // since we saved memory in the forward pass by not storing all col data, - // we will need to recompute them. - im2col_gpu(bottom_data + (*bottom)[0]->offset(n), CHANNELS_, HEIGHT_, - WIDTH_, KSIZE_, PAD_, STRIDE_, col_data); - // gradient w.r.t. weight. Note that we will accumulate diffs. - for (int g = 0; g < GROUP_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, - (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, - col_data + col_offset * g, (Dtype)1., - weight_diff + weight_offset * g); - } - // gradient w.r.t. bottom data, if necessary - if (propagate_down) { - for (int g = 0; g < GROUP_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, - (Dtype)1., weight + weight_offset * g, - top_diff + top[0]->offset(n) + top_offset * g, - (Dtype)0., col_diff + col_offset * g); - } - // col2im back to the data - col2im_gpu(col_diff, CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, - bottom_diff + (*bottom)[0]->offset(n)); - } - } - return Dtype(0.); -} - INSTANTIATE_CLASS(ConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu new file mode 100644 index 0000000..a7f56fa --- /dev/null +++ b/src/caffe/layers/conv_layer.cu @@ -0,0 +1,104 @@ +// Copyright 2013 Yangqing Jia + +#include + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/filler.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void ConvolutionLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + Dtype* col_data = col_buffer_.mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + int weight_offset = M_ * K_; + int col_offset = K_ * N_; + int top_offset = M_ * N_; + for (int n = 0; n < NUM_; ++n) { + // First, im2col + im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, + WIDTH_, KSIZE_, PAD_, STRIDE_, col_data); + // Second, innerproduct with groups + for (int g = 0; g < GROUP_; ++g) { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, K_, + (Dtype)1., weight + weight_offset * g, col_data + col_offset * g, + (Dtype)0., top_data + (*top)[0]->offset(n) + top_offset * g); + } + // third, add bias + if (biasterm_) { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, NUM_OUTPUT_, + N_, 1, (Dtype)1., 
this->blobs_[1]->gpu_data(), + reinterpret_cast(bias_multiplier_->gpu_data()), + (Dtype)1., top_data + (*top)[0]->offset(n)); + } + } +} + +template +Dtype ConvolutionLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->gpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); + Dtype* col_data = col_buffer_.mutable_gpu_data(); + Dtype* col_diff = col_buffer_.mutable_gpu_diff(); + // bias gradient if necessary + Dtype* bias_diff = NULL; + + if (biasterm_) { + bias_diff = this->blobs_[1]->mutable_gpu_diff(); + CUDA_CHECK(cudaMemset(bias_diff, 0, + sizeof(Dtype) * this->blobs_[1]->count())); + for (int n = 0; n < NUM_; ++n) { + caffe_gpu_gemv(CblasNoTrans, NUM_OUTPUT_, N_, + 1., top_diff + top[0]->offset(n), + reinterpret_cast(bias_multiplier_->gpu_data()), + 1., bias_diff); + } + } + + int weight_offset = M_ * K_; + int col_offset = K_ * N_; + int top_offset = M_ * N_; + CUDA_CHECK(cudaMemset(weight_diff, 0, + sizeof(Dtype) * this->blobs_[0]->count())); + for (int n = 0; n < NUM_; ++n) { + // since we saved memory in the forward pass by not storing all col data, + // we will need to recompute them. + im2col_gpu(bottom_data + (*bottom)[0]->offset(n), CHANNELS_, HEIGHT_, + WIDTH_, KSIZE_, PAD_, STRIDE_, col_data); + // gradient w.r.t. weight. Note that we will accumulate diffs. + for (int g = 0; g < GROUP_; ++g) { + caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, K_, N_, + (Dtype)1., top_diff + top[0]->offset(n) + top_offset * g, + col_data + col_offset * g, (Dtype)1., + weight_diff + weight_offset * g); + } + // gradient w.r.t. bottom data, if necessary + if (propagate_down) { + for (int g = 0; g < GROUP_; ++g) { + caffe_gpu_gemm(CblasTrans, CblasNoTrans, K_, N_, M_, + (Dtype)1., weight + weight_offset * g, + top_diff + top[0]->offset(n) + top_offset * g, + (Dtype)0., col_diff + col_offset * g); + } + // col2im back to the data + col2im_gpu(col_diff, CHANNELS_, HEIGHT_, WIDTH_, KSIZE_, PAD_, STRIDE_, + bottom_diff + (*bottom)[0]->offset(n)); + } + } + return Dtype(0.); +} + + +INSTANTIATE_CLASS(ConvolutionLayer); + +} // namespace caffe diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index f973a56..cc03cdb 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -227,23 +227,6 @@ void DataLayer::Forward_cpu(const vector*>& bottom, reinterpret_cast(this))) << "Pthread execution failed."; } -template -void DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - // First, join the thread - CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; - // Copy the data - CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), - prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), - prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), - cudaMemcpyHostToDevice)); - // Start a new prefetch thread - CHECK(!pthread_create(&thread_, NULL, DataLayerPrefetch, - reinterpret_cast(this))) << "Pthread execution failed."; -} - // The backward operations are dummy - they do not carry any computation. 
template Dtype DataLayer::Backward_cpu(const vector*>& top, @@ -251,12 +234,6 @@ Dtype DataLayer::Backward_cpu(const vector*>& top, return Dtype(0.); } -template -Dtype DataLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - return Dtype(0.); -} - INSTANTIATE_CLASS(DataLayer); } // namespace caffe diff --git a/src/caffe/layers/data_layer.cu b/src/caffe/layers/data_layer.cu new file mode 100644 index 0000000..946f30f --- /dev/null +++ b/src/caffe/layers/data_layer.cu @@ -0,0 +1,44 @@ +// Copyright 2013 Yangqing Jia + +#include +#include +#include + +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" + +using std::string; + +namespace caffe { + +template +void DataLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + // First, join the thread + CHECK(!pthread_join(thread_, NULL)) << "Pthread joining failed."; + // Copy the data + CUDA_CHECK(cudaMemcpy((*top)[0]->mutable_gpu_data(), + prefetch_data_->cpu_data(), sizeof(Dtype) * prefetch_data_->count(), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy((*top)[1]->mutable_gpu_data(), + prefetch_label_->cpu_data(), sizeof(Dtype) * prefetch_label_->count(), + cudaMemcpyHostToDevice)); + // Start a new prefetch thread + CHECK(!pthread_create(&thread_, NULL, DataLayerPrefetch, + reinterpret_cast(this))) << "Pthread execution failed."; +} + +// The backward operations are dummy - they do not carry any computation. +template +Dtype DataLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + return Dtype(0.); +} + +INSTANTIATE_CLASS(DataLayer); + +} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp new file mode 100644 index 0000000..4e1fbfa --- /dev/null +++ b/src/caffe/layers/dropout_layer.cpp @@ -0,0 +1,63 @@ +// Copyright 2013 Yangqing Jia + +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void DropoutLayer::SetUp(const vector*>& bottom, + vector*>* top) { + NeuronLayer::SetUp(bottom, top); + // Set up the cache for random number generation + rand_vec_.reset(new SyncedMemory(bottom[0]->count() * sizeof(int))); + threshold_ = this->layer_param_.dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. - threshold_); + uint_thres_ = (unsigned int)(UINT_MAX * threshold_); +} + +template +void DropoutLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + const int count = bottom[0]->count(); + if (Caffe::phase() == Caffe::TRAIN) { + // Create random numbers + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(), + count, mask, 1. 
- threshold_); + for (int i = 0; i < count; ++i) { + top_data[i] = bottom_data[i] * mask[i] * scale_; + } + } else { + memcpy(top_data, bottom_data, bottom[0]->count() * sizeof(Dtype)); + } +} + +template +Dtype DropoutLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + CHECK(Caffe::phase() == Caffe::TRAIN); + if (propagate_down) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const int* mask = reinterpret_cast(rand_vec_->cpu_data()); + const int count = (*bottom)[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * mask[i] * scale_; + } + } + return Dtype(0); +} + + +INSTANTIATE_CLASS(DropoutLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu index efba295..2b6a68b 100644 --- a/src/caffe/layers/dropout_layer.cu +++ b/src/caffe/layers/dropout_layer.cu @@ -13,54 +13,6 @@ using std::max; namespace caffe { -template -void DropoutLayer::SetUp(const vector*>& bottom, - vector*>* top) { - NeuronLayer::SetUp(bottom, top); - // Set up the cache for random number generation - rand_vec_.reset(new SyncedMemory(bottom[0]->count() * sizeof(int))); - threshold_ = this->layer_param_.dropout_ratio(); - DCHECK(threshold_ > 0.); - DCHECK(threshold_ < 1.); - scale_ = 1. / (1. - threshold_); - uint_thres_ = (unsigned int)(UINT_MAX * threshold_); -} - -template -void DropoutLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); - const int count = bottom[0]->count(); - if (Caffe::phase() == Caffe::TRAIN) { - // Create random numbers - viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, Caffe::vsl_stream(), - count, mask, 1. 
- threshold_); - for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] * mask[i] * scale_; - } - } else { - memcpy(top_data, bottom_data, bottom[0]->count() * sizeof(Dtype)); - } -} - -template -Dtype DropoutLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - CHECK(Caffe::phase() == Caffe::TRAIN); - if (propagate_down) { - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const int* mask = reinterpret_cast(rand_vec_->cpu_data()); - const int count = (*bottom)[0]->count(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * mask[i] * scale_; - } - } - return Dtype(0); -} template __global__ void DropoutForward(const int n, const Dtype* in, diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index bedf296..9e17a82 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -30,14 +30,6 @@ void FlattenLayer::Forward_cpu(const vector*>& bottom, } template -void FlattenLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - caffe_gpu_copy(count_, bottom_data, top_data); -} - -template Dtype FlattenLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -46,16 +38,6 @@ Dtype FlattenLayer::Backward_cpu(const vector*>& top, return Dtype(0.); } - -template -Dtype FlattenLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - caffe_gpu_copy(count_, top_diff, bottom_diff); - return Dtype(0.); -} - INSTANTIATE_CLASS(FlattenLayer); } // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cu b/src/caffe/layers/flatten_layer.cu new file mode 100644 index 0000000..571e22e --- /dev/null +++ b/src/caffe/layers/flatten_layer.cu @@ -0,0 +1,30 @@ +// Copyright 2013 Yangqing Jia + +#include + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void FlattenLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + caffe_gpu_copy(count_, bottom_data, top_data); +} + +template +Dtype FlattenLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); + caffe_gpu_copy(count_, top_diff, bottom_diff); + return Dtype(0.); +} + +INSTANTIATE_CLASS(FlattenLayer); + +} // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 11b7d29..c31213e 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -65,28 +65,6 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, } } -template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const int batchsize = this->layer_param_.batchsize(); - for (int i = 0; i < batchsize; ++i, ++current_row) { - if (current_row == data_dims[0]) { - current_row = 0; - } - - CUDA_CHECK(cudaMemcpy( - &(*top)[0]->mutable_gpu_data()[i * data_dims[1]], - &(data.get()[current_row * data_dims[1]]), - sizeof(Dtype) * data_dims[1], - 
cudaMemcpyHostToDevice)); - - CUDA_CHECK(cudaMemcpy( - &(*top)[1]->mutable_gpu_data()[i * label_dims[1]], - &(label.get()[current_row * label_dims[1]]), - sizeof(Dtype) * label_dims[1], - cudaMemcpyHostToDevice)); - } -} // The backward operations are dummy - they do not carry any computation. template @@ -95,12 +73,6 @@ Dtype HDF5DataLayer::Backward_cpu(const vector*>& top, return Dtype(0.); } -template -Dtype HDF5DataLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - return Dtype(0.); -} - INSTANTIATE_CLASS(HDF5DataLayer); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu new file mode 100644 index 0000000..1ecf149 --- /dev/null +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -0,0 +1,53 @@ +// Copyright Sergey Karayev 2014 +/* +TODO: +- only load parts of the file, in accordance with a prototxt param "max_mem" +*/ + +#include +#include +#include + +#include "hdf5.h" +#include "hdf5_hl.h" + +#include "caffe/layer.hpp" +#include "caffe/util/io.hpp" +#include "caffe/vision_layers.hpp" + +using std::string; + +namespace caffe { + +template +void HDF5DataLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const int batchsize = this->layer_param_.batchsize(); + for (int i = 0; i < batchsize; ++i, ++current_row) { + if (current_row == data_dims[0]) { + current_row = 0; + } + + CUDA_CHECK(cudaMemcpy( + &(*top)[0]->mutable_gpu_data()[i * data_dims[1]], + &(data.get()[current_row * data_dims[1]]), + sizeof(Dtype) * data_dims[1], + cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMemcpy( + &(*top)[1]->mutable_gpu_data()[i * label_dims[1]], + &(label.get()[current_row * label_dims[1]]), + sizeof(Dtype) * label_dims[1], + cudaMemcpyHostToDevice)); + } +} + +template +Dtype HDF5DataLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + return Dtype(0.); +} + +INSTANTIATE_CLASS(HDF5DataLayer); + +} // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index a94209b..e711713 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -37,17 +37,6 @@ void Im2colLayer::Forward_cpu(const vector*>& bottom, } template -void Im2colLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, - WIDTH_, KSIZE_, PAD_, STRIDE_, top_data + (*top)[0]->offset(n)); - } -} - -template Dtype Im2colLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -59,19 +48,6 @@ Dtype Im2colLayer::Backward_cpu(const vector*>& top, return Dtype(0.); } - -template -Dtype Im2colLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), CHANNELS_, HEIGHT_, - WIDTH_, KSIZE_, PAD_, STRIDE_, bottom_diff + (*bottom)[0]->offset(n)); - } - return Dtype(0.); -} - INSTANTIATE_CLASS(Im2colLayer); } // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu new file mode 100644 index 0000000..2d949b1 --- /dev/null +++ b/src/caffe/layers/im2col_layer.cu @@ -0,0 +1,38 @@ +// 
Copyright 2013 Yangqing Jia + +#include + +#include "caffe/layer.hpp" +#include "caffe/util/im2col.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/common.hpp" + +namespace caffe { + +template +void Im2colLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, + WIDTH_, KSIZE_, PAD_, STRIDE_, top_data + (*top)[0]->offset(n)); + } +} + +template +Dtype Im2colLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff + top[0]->offset(n), CHANNELS_, HEIGHT_, + WIDTH_, KSIZE_, PAD_, STRIDE_, bottom_diff + (*bottom)[0]->offset(n)); + } + return Dtype(0.); +} + + +INSTANTIATE_CLASS(Im2colLayer); + +} // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index d770e23a..6987a78 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -2,7 +2,6 @@ #include -#include #include @@ -100,45 +99,6 @@ Dtype InnerProductLayer::Backward_cpu(const vector*>& top, return Dtype(0); } -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = (*top)[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (biasterm_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - reinterpret_cast(bias_multiplier_->gpu_data()), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } -} - -template -Dtype InnerProductLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = (*bottom)[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); - if (biasterm_) { - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - reinterpret_cast(bias_multiplier_->gpu_data()), - (Dtype)0., this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down) { - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - (*bottom)[0]->mutable_gpu_diff()); - } - return Dtype(0); -} - INSTANTIATE_CLASS(InnerProductLayer); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu new file mode 100644 index 0000000..c7c3e2a --- /dev/null +++ b/src/caffe/layers/inner_product_layer.cu @@ -0,0 +1,59 @@ +// Copyright 2013 Yangqing Jia + + +#include +#include + +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void InnerProductLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* 
top_data = (*top)[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + bottom_data, weight, (Dtype)0., top_data); + if (biasterm_) { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., + reinterpret_cast(bias_multiplier_->gpu_data()), + this->blobs_[1]->gpu_data(), (Dtype)1., top_data); + } +} + +template +Dtype InnerProductLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = (*bottom)[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., + top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); + if (biasterm_) { + // Gradient with respect to bias + caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, + reinterpret_cast(bias_multiplier_->gpu_data()), + (Dtype)0., this->blobs_[1]->mutable_gpu_diff()); + } + if (propagate_down) { + // Gradient with respect to bottom data + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., + top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., + (*bottom)[0]->mutable_gpu_diff()); + } + return Dtype(0); +} + +INSTANTIATE_CLASS(InnerProductLayer); + +} // namespace caffe diff --git a/src/caffe/layers/loss_layer.cu b/src/caffe/layers/loss_layer.cpp similarity index 96% rename from src/caffe/layers/loss_layer.cu rename to src/caffe/layers/loss_layer.cpp index 745bfa4..1c4303d 100644 --- a/src/caffe/layers/loss_layer.cu +++ b/src/caffe/layers/loss_layer.cpp @@ -42,7 +42,7 @@ Dtype MultinomialLogisticLossLayer::Backward_cpu( Dtype loss = 0; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); - Dtype prob = max(bottom_data[i * dim + label], kLOG_THRESHOLD); + Dtype prob = max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); loss -= log(prob); bottom_diff[i * dim + label] = - 1. 
/ prob / num; } @@ -86,7 +86,7 @@ Dtype InfogainLossLayer::Backward_cpu(const vector*>& top, for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); for (int j = 0; j < dim; ++j) { - Dtype prob = max(bottom_data[i * dim + j], kLOG_THRESHOLD); + Dtype prob = max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); loss -= infogain_mat[label * dim + j] * log(prob); bottom_diff[i * dim + j] = - infogain_mat[label * dim + j] / prob / num; } @@ -160,7 +160,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, ++accuracy; } Dtype prob = max(bottom_data[i * dim + static_cast(bottom_label[i])], - kLOG_THRESHOLD); + Dtype(kLOG_THRESHOLD)); logprob -= log(prob); } // LOG(INFO) << "Accuracy: " << accuracy; diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp new file mode 100644 index 0000000..478ed31 --- /dev/null +++ b/src/caffe/layers/relu_layer.cpp @@ -0,0 +1,42 @@ +// Copyright 2013 Yangqing Jia + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include + +using std::max; + +namespace caffe { + +template +void ReLULayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = max(bottom_data[i], Dtype(0)); + } +} + +template +Dtype ReLULayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + if (propagate_down) { + const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const int count = (*bottom)[0]->count(); + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); + } + } + return Dtype(0); +} + + +INSTANTIATE_CLASS(ReLULayer); + + +} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu index ed1aab4..e2e58d9 100644 --- a/src/caffe/layers/relu_layer.cu +++ b/src/caffe/layers/relu_layer.cu @@ -11,33 +11,6 @@ using std::max; namespace caffe { template -void ReLULayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = max(bottom_data[i], Dtype(0)); - } -} - -template -Dtype ReLULayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - if (propagate_down) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const int count = (*bottom)[0]->count(); - for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * (bottom_data[i] > 0); - } - } - return Dtype(0); -} - -template __global__ void ReLUForward(const int n, const Dtype* in, Dtype* out) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < n) { diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp new file mode 100644 index 0000000..112771f --- /dev/null +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -0,0 +1,46 @@ +// Copyright 2014 Tobias Domhan + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include +#include + +namespace caffe { + +template +inline Dtype sigmoid(Dtype x) { + return 1. / (1. 
+ exp(-x)); +} + +template +void SigmoidLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + top_data[i] = sigmoid(bottom_data[i]); + } +} + +template +Dtype SigmoidLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + if (propagate_down) { + const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const int count = (*bottom)[0]->count(); + for (int i = 0; i < count; ++i) { + Dtype sigmoid_x = sigmoid(bottom_data[i]); + bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x); + } + } + return Dtype(0); +} + +INSTANTIATE_CLASS(SigmoidLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu index e50260d..785d144 100644 --- a/src/caffe/layers/sigmoid_layer.cu +++ b/src/caffe/layers/sigmoid_layer.cu @@ -12,45 +12,10 @@ using std::max; namespace caffe { template -inline Dtype sigmoid(Dtype x) { - return 1. / (1. + exp(-x)); -} - -template -void SigmoidLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - top_data[i] = sigmoid(bottom_data[i]); - } -} - -template -Dtype SigmoidLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - if (propagate_down) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const int count = (*bottom)[0]->count(); - for (int i = 0; i < count; ++i) { - Dtype sigmoid_x = sigmoid(bottom_data[i]); - bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x); - } - } - return Dtype(0); -} - - -template __device__ inline Dtype sigmoid_gpu(Dtype x) { return 1. / (1. 
+ exp(-x)); } - template __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { int index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp new file mode 100644 index 0000000..172094d --- /dev/null +++ b/src/caffe/layers/softmax_layer.cpp @@ -0,0 +1,86 @@ +// Copyright 2013 Yangqing Jia + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +using std::max; + +namespace caffe { + +template +void SoftmaxLayer::SetUp(const vector*>& bottom, + vector*>* top) { + CHECK_EQ(bottom.size(), 1) << "Softmax Layer takes a single blob as input."; + CHECK_EQ(top->size(), 1) << "Softmax Layer takes a single blob as output."; + (*top)[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + sum_multiplier_.Reshape(1, bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); + for (int i = 0; i < sum_multiplier_.count(); ++i) { + multiplier_data[i] = 1.; + } + scale_.Reshape(bottom[0]->num(), 1, 1, 1); +} + +template +void SoftmaxLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int num = bottom[0]->num(); + int dim = bottom[0]->count() / bottom[0]->num(); + memcpy(top_data, bottom_data, sizeof(Dtype) * bottom[0]->count()); + // we need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. + for (int i = 0; i < num; ++i) { + scale_data[i] = bottom_data[i*dim]; + for (int j = 0; j < dim; ++j) { + scale_data[i] = max(scale_data[i], bottom_data[i * dim + j]); + } + } + // subtraction + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + scale_data, sum_multiplier_.cpu_data(), 1., top_data); + // Perform exponentiation + caffe_exp(num * dim, top_data, top_data); + // sum after exp + caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_data, + sum_multiplier_.cpu_data(), 0., scale_data); + // Do division + for (int i = 0; i < num; ++i) { + caffe_scal(dim, Dtype(1.) 
/ scale_data[i], top_data + i * dim); + } +} + +template +Dtype SoftmaxLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* top_data = top[0]->cpu_data(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + Dtype* scale_data = scale_.mutable_cpu_data(); + int num = top[0]->num(); + int dim = top[0]->count() / top[0]->num(); + memcpy(bottom_diff, top_diff, sizeof(Dtype) * top[0]->count()); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff + for (int i = 0; i < num; ++i) { + scale_data[i] = caffe_cpu_dot(dim, top_diff + i * dim, + top_data + i * dim); + } + // subtraction + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., + scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); + // elementwise multiplication + caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); + return Dtype(0); +} + + +INSTANTIATE_CLASS(SoftmaxLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu index af73260..fe2a89e 100644 --- a/src/caffe/layers/softmax_layer.cu +++ b/src/caffe/layers/softmax_layer.cu @@ -15,53 +15,6 @@ using std::max; namespace caffe { template -void SoftmaxLayer::SetUp(const vector*>& bottom, - vector*>* top) { - CHECK_EQ(bottom.size(), 1) << "Softmax Layer takes a single blob as input."; - CHECK_EQ(top->size(), 1) << "Softmax Layer takes a single blob as output."; - (*top)[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); - for (int i = 0; i < sum_multiplier_.count(); ++i) { - multiplier_data[i] = 1.; - } - scale_.Reshape(bottom[0]->num(), 1, 1, 1); -} - -template -void SoftmaxLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - memcpy(top_data, bottom_data, sizeof(Dtype) * bottom[0]->count()); - // we need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - for (int i = 0; i < num; ++i) { - scale_data[i] = bottom_data[i*dim]; - for (int j = 0; j < dim; ++j) { - scale_data[i] = max(scale_data[i], bottom_data[i * dim + j]); - } - } - // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., top_data); - // Perform exponentiation - caffe_exp(num * dim, top_data, top_data); - // sum after exp - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_data, - sum_multiplier_.cpu_data(), 0., scale_data); - // Do division - for (int i = 0; i < num; ++i) { - caffe_scal(dim, Dtype(1.) 
/ scale_data[i], top_data + i * dim); - } -} - -template __global__ void kernel_get_max(const int num, const int dim, const Dtype* data, Dtype* out) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -125,30 +78,6 @@ void SoftmaxLayer::Forward_gpu(const vector*>& bottom, num, dim, scale_data, top_data); } -template -Dtype SoftmaxLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - const Dtype* top_diff = top[0]->cpu_diff(); - const Dtype* top_data = top[0]->cpu_data(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - Dtype* scale_data = scale_.mutable_cpu_data(); - int num = top[0]->num(); - int dim = top[0]->count() / top[0]->num(); - memcpy(bottom_diff, top_diff, sizeof(Dtype) * top[0]->count()); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff - for (int i = 0; i < num; ++i) { - scale_data[i] = caffe_cpu_dot(dim, top_diff + i * dim, - top_data + i * dim); - } - // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - scale_data, sum_multiplier_.cpu_data(), 1., bottom_diff); - // elementwise multiplication - caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); - return Dtype(0); -} - // TODO(Yangqing): implement the GPU version of softmax. template Dtype SoftmaxLayer::Backward_gpu(const vector*>& top, diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp new file mode 100644 index 0000000..2ec7308 --- /dev/null +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -0,0 +1,59 @@ +// Copyright 2013 Yangqing Jia + +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +using std::max; + +namespace caffe { + +template +void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, + vector*>* top) { + CHECK_EQ(bottom.size(), 2) << "SoftmaxLoss Layer takes two blobs as input."; + CHECK_EQ(top->size(), 0) << "SoftmaxLoss Layer takes no blob as output."; + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, &softmax_top_vec_); +} + +template +void SoftmaxWithLossLayer::Forward_cpu( + const vector*>& bottom, vector*>* top) { + // The forward pass computes the softmax prob values. 
+ softmax_bottom_vec_[0] = bottom[0]; + softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); +} + +template +Dtype SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + // First, compute the diff + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const Dtype* prob_data = prob_.cpu_data(); + memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); + const Dtype* label = (*bottom)[1]->cpu_data(); + int num = prob_.num(); + int dim = prob_.count() / num; + Dtype loss = 0; + for (int i = 0; i < num; ++i) { + bottom_diff[i * dim + static_cast(label[i])] -= 1; + loss += -log(max(prob_data[i * dim + static_cast(label[i])], Dtype(FLT_MIN))); + } + // Scale down gradient + caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); + return loss / num; +} + + +INSTANTIATE_CLASS(SoftmaxWithLossLayer); + + +} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index 3e26586..100393c 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -13,25 +13,6 @@ using std::max; namespace caffe { template -void SoftmaxWithLossLayer::SetUp(const vector*>& bottom, - vector*>* top) { - CHECK_EQ(bottom.size(), 2) << "SoftmaxLoss Layer takes two blobs as input."; - CHECK_EQ(top->size(), 0) << "SoftmaxLoss Layer takes no blob as output."; - softmax_bottom_vec_.clear(); - softmax_bottom_vec_.push_back(bottom[0]); - softmax_top_vec_.push_back(&prob_); - softmax_layer_->SetUp(softmax_bottom_vec_, &softmax_top_vec_); -} - -template -void SoftmaxWithLossLayer::Forward_cpu( - const vector*>& bottom, vector*>* top) { - // The forward pass computes the softmax prob values. - softmax_bottom_vec_[0] = bottom[0]; - softmax_layer_->Forward(softmax_bottom_vec_, &softmax_top_vec_); -} - -template void SoftmaxWithLossLayer::Forward_gpu( const vector*>& bottom, vector*>* top) { // The forward pass computes the softmax prob values. @@ -40,27 +21,6 @@ void SoftmaxWithLossLayer::Forward_gpu( } template -Dtype SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - // First, compute the diff - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const Dtype* prob_data = prob_.cpu_data(); - memcpy(bottom_diff, prob_data, sizeof(Dtype) * prob_.count()); - const Dtype* label = (*bottom)[1]->cpu_data(); - int num = prob_.num(); - int dim = prob_.count() / num; - Dtype loss = 0; - for (int i = 0; i < num; ++i) { - bottom_diff[i * dim + static_cast(label[i])] -= 1; - loss += -log(max(prob_data[i * dim + static_cast(label[i])], FLT_MIN)); - } - // Scale down gradient - caffe_scal(prob_.count(), Dtype(1) / num, bottom_diff); - return loss / num; -} - -template Dtype SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { // TODO(Yangqing): implement the GPU version of softmax. 
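For reference, Backward_cpu in the new softmax_loss_layer.cpp implements the usual softmax-plus-negative-log-likelihood gradient: with the probabilities p cached in prob_, the diff for sample i and class j is bottom_diff[i * dim + j] = (p_ij - 1{j == label_i}) / num, and the returned loss is the batch mean of -log(max(p_label, FLT_MIN)). The Dtype(FLT_MIN) cast added relative to the old .cu version is presumably there so that std::max can deduce a single argument type when Dtype is double, since FLT_MIN itself expands to a float constant.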
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 56e9561..f9fc461 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -41,19 +41,6 @@ void SplitLayer::Forward_cpu(const vector*>& bottom, } template -void SplitLayer::Forward_gpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - for (int i = 0; i < top->size(); ++i) { - if (i == 0 && (*top)[i] == bottom[0]) { - continue; - } - Dtype* top_data = (*top)[i]->mutable_gpu_data(); - caffe_gpu_copy(count_, bottom_data, top_data); - } -} - -template Dtype SplitLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { if (propagate_down) { @@ -75,27 +62,6 @@ Dtype SplitLayer::Backward_cpu(const vector*>& top, } -template -Dtype SplitLayer::Backward_gpu(const vector*>& top, - const bool propagate_down, vector*>* bottom) { - if (propagate_down) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); - // Initialize by copying first top blob diff to our diff, unless we're - // doing in-place computation for the first blob, in which case the diff is - // already initialized. - if (top[0] != (*bottom)[0]) { - caffe_gpu_copy(count_, top_diff, bottom_diff); - } - // Add remaining top blob diffs. - for (int i = 1; i < top.size(); ++i) { - top_diff = top[i]->gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } - } - return Dtype(0.); -} - INSTANTIATE_CLASS(SplitLayer); } // namespace caffe diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu new file mode 100644 index 0000000..5f25a46 --- /dev/null +++ b/src/caffe/layers/split_layer.cu @@ -0,0 +1,48 @@ +// Copyright 2014 Jeff Donahue + +#include + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SplitLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + for (int i = 0; i < top->size(); ++i) { + if (i == 0 && (*top)[i] == bottom[0]) { + continue; + } + Dtype* top_data = (*top)[i]->mutable_gpu_data(); + caffe_gpu_copy(count_, bottom_data, top_data); + } +} + +template +Dtype SplitLayer::Backward_gpu(const vector*>& top, + const bool propagate_down, vector*>* bottom) { + if (propagate_down) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_gpu_diff(); + // Initialize by copying first top blob diff to our diff, unless we're + // doing in-place computation for the first blob, in which case the diff is + // already initialized. + if (top[0] != (*bottom)[0]) { + caffe_gpu_copy(count_, top_diff, bottom_diff); + } + // Add remaining top blob diffs. + for (int i = 1; i < top.size(); ++i) { + top_diff = top[i]->gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } + } + return Dtype(0.); +} + + +INSTANTIATE_CLASS(SplitLayer); + +} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp new file mode 100644 index 0000000..d6f9956 --- /dev/null +++ b/src/caffe/layers/tanh_layer.cpp @@ -0,0 +1,48 @@ +// Copyright 2014 Aravindh Mahendran +// TanH neuron activation function layer. 
+// Adapted from ReLU layer code written by Yangqing Jia + +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +template +void TanHLayer::Forward_cpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = (*top)[0]->mutable_cpu_data(); + Dtype exp2x; + const int count = bottom[0]->count(); + for (int i = 0; i < count; ++i) { + exp2x = exp(2*bottom_data[i]); + top_data[i] = (exp2x - Dtype(1))/(exp2x + Dtype(1)); + } +} + +template +Dtype TanHLayer::Backward_cpu(const vector*>& top, + const bool propagate_down, + vector*>* bottom) { + if (propagate_down) { + const Dtype* bottom_data = (*bottom)[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); + const int count = (*bottom)[0]->count(); + Dtype exp2x; + Dtype tanhx; + for (int i = 0; i < count; ++i) { + exp2x = exp(2*bottom_data[i]); + tanhx = (exp2x - Dtype(1))/(exp2x + Dtype(1)); + bottom_diff[i] = top_diff[i] * (1 - tanhx*tanhx); + } + } + return Dtype(0); +} + +INSTANTIATE_CLASS(TanHLayer); + +} // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu index a309a60..743e314 100644 --- a/src/caffe/layers/tanh_layer.cu +++ b/src/caffe/layers/tanh_layer.cu @@ -11,39 +11,6 @@ namespace caffe { template -void TanHLayer::Forward_cpu(const vector*>& bottom, - vector*>* top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = (*top)[0]->mutable_cpu_data(); - Dtype exp2x; - const int count = bottom[0]->count(); - for (int i = 0; i < count; ++i) { - exp2x = exp(2*bottom_data[i]); - top_data[i] = (exp2x - Dtype(1))/(exp2x + Dtype(1)); - } -} - -template -Dtype TanHLayer::Backward_cpu(const vector*>& top, - const bool propagate_down, - vector*>* bottom) { - if (propagate_down) { - const Dtype* bottom_data = (*bottom)[0]->cpu_data(); - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff(); - const int count = (*bottom)[0]->count(); - Dtype exp2x; - Dtype tanhx; - for (int i = 0; i < count; ++i) { - exp2x = exp(2*bottom_data[i]); - tanhx = (exp2x - Dtype(1))/(exp2x + Dtype(1)); - bottom_diff[i] = top_diff[i] * (1 - tanhx*tanhx); - } - } - return Dtype(0); -} - -template __global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < n) { -- 2.7.4
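For anyone following the layout this patch establishes: each layer keeps its CPU paths (SetUp where device-independent, Forward_cpu, Backward_cpu) in src/caffe/layers/<name>_layer.cpp and moves the GPU paths (Forward_gpu, Backward_gpu, and any __global__ kernels) into a sibling <name>_layer.cu, so only the .cu files need nvcc while the .cpp files build with an ordinary C++ compiler. A minimal sketch of the pattern, using a hypothetical FooLayer that is not part of this patch (assumed to be declared in caffe/vision_layers.hpp like the layers above, with the old vector<Blob<Dtype>*> Forward/Backward signatures):

// src/caffe/layers/foo_layer.cpp -- CPU implementation, compiled by g++
#include <vector>

#include "caffe/layer.hpp"
#include "caffe/vision_layers.hpp"

namespace caffe {

template <typename Dtype>
void FooLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = (*top)[0]->mutable_cpu_data();
  const int count = bottom[0]->count();
  for (int i = 0; i < count; ++i) {
    top_data[i] = bottom_data[i];  // identity pass-through, purely illustrative
  }
}

template <typename Dtype>
Dtype FooLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  if (propagate_down) {
    const Dtype* top_diff = top[0]->cpu_diff();
    Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
    const int count = (*bottom)[0]->count();
    for (int i = 0; i < count; ++i) {
      bottom_diff[i] = top_diff[i];
    }
  }
  return Dtype(0.);
}

INSTANTIATE_CLASS(FooLayer);

}  // namespace caffe

// src/caffe/layers/foo_layer.cu -- GPU implementation, compiled by nvcc
#include <vector>

#include "caffe/layer.hpp"
#include "caffe/vision_layers.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void FooLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  // Same identity op as the CPU path, done on the device.
  caffe_gpu_copy(bottom[0]->count(), bottom[0]->gpu_data(),
      (*top)[0]->mutable_gpu_data());
}

template <typename Dtype>
Dtype FooLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const bool propagate_down, vector<Blob<Dtype>*>* bottom) {
  if (propagate_down) {
    caffe_gpu_copy(top[0]->count(), top[0]->gpu_diff(),
        (*bottom)[0]->mutable_gpu_diff());
  }
  return Dtype(0.);
}

INSTANTIATE_CLASS(FooLayer);

}  // namespace caffe

Both translation units end with INSTANTIATE_CLASS here only because that mirrors the .cpp/.cu pairs in the diffs above (e.g. flatten_layer.cpp and flatten_layer.cu).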