From a8c9b66b7f62610d71a18c798d5eb7157d49420c Mon Sep 17 00:00:00 2001
From: Rowland Depp
Date: Tue, 11 Feb 2014 21:41:01 -0800
Subject: [PATCH] major refactoring: allow coexistence of MKL and non-MKL cases

---
 Makefile                              |   8 ++
 Makefile.config.example               |   2 +
 include/caffe/util/math_functions.hpp |   7 +-
 include/caffe/util/mkl_alternate.hpp  |  95 +++++++++++++++++++++
 src/caffe/layers/loss_layer.cpp       |   2 +-
 src/caffe/solver.cpp                  |   2 +-
 src/caffe/util/math_functions.cpp     | 150 +++++-----------------------------
 7 files changed, 131 insertions(+), 135 deletions(-)
 create mode 100644 include/caffe/util/mkl_alternate.hpp

diff --git a/Makefile b/Makefile
index 6cc8f1e..488acb4 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,14 @@ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
 		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
+# MKL options
+ifdef USE_MKL
+  LIBRARIES += mkl_rt
+  COMMON_FLAGS += -DUSE_MKL
+else
+  LIBRARIES += atlas cblas
+endif
+
 
 ##############################
 # Define build targets
diff --git a/Makefile.config.example b/Makefile.config.example
index cec85e0..0ec2eea 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -10,6 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
 		-gencode arch=compute_30,code=sm_30 \
 		-gencode arch=compute_35,code=sm_35
 
+# To build against MKL, uncomment the following line.
+# USE_MKL=1
 # MKL directory contains include/ and lib/ directories that we need.
 MKL_DIR := /opt/intel/mkl
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 1ff8a77..db19acc 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -4,10 +4,11 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-//#include <mkl.h>
-#include <eigen3/Eigen/Dense>
+
 #include <cublas_v2.h>
 
+#include "caffe/util/mkl_alternate.hpp"
+
 namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
@@ -46,7 +47,7 @@
 template <typename Dtype>
 void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y);
 
 template <typename Dtype>
-void caffe_axpby(const int N, const Dtype alpha, const Dtype* X,
+void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
     const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
new file mode 100644
index 0000000..1c207c6
--- /dev/null
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -0,0 +1,95 @@
+// Copyright 2013 Rowland Depp
+
+#ifndef CAFFE_UTIL_MKL_ALTERNATE_H_
+#define CAFFE_UTIL_MKL_ALTERNATE_H_
+
+#ifdef USE_MKL
+
+#include <mkl.h>
+
+#else  // If not using MKL, provide plain C++ alternatives to the MKL routines.
+
+#include <cblas.h>
+#include <math.h>
+
+// Functions that caffe uses but that are not present when MKL is not linked.
+
+// A simple way to define the vsl unary functions. The operation should
+// be in the form, e.g., y[i] = sqrt(a[i])
+#define DEFINE_VSL_UNARY_FUNC(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, float* y) { \
+    v##name(n, a, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, double* y) { \
+    v##name(n, a, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
+
+// A simple way to define the vsl unary functions with a scalar parameter b.
+// The operation should be in the form, e.g., y[i] = pow(a[i], b)
+#define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, const float b, float* y) { \
+    v##name(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double b, double* y) { \
+    v##name(n, a, b, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+
+// A simple way to define the vsl binary functions. The operation should
+// be in the form, e.g., y[i] = a[i] + b[i]
+#define DEFINE_VSL_BINARY_FUNC(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, const float* b, float* y) { \
+    v##name(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double* b, double* y) { \
+    v##name(n, a, b, y); \
+  }
+
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+
+// In addition, MKL comes with an extra function, axpby, that is not present
+// in standard BLAS. We simply use a two-step (inefficient, of course) way
+// to mimic it.
+inline void cblas_saxpby(const int N, const float alpha, const float* X,
+                         const int incX, const float beta, float* Y,
+                         const int incY) {
+  cblas_sscal(N, beta, Y, incY);
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_daxpby(const int N, const double alpha, const double* X,
+                         const int incX, const double beta, double* Y,
+                         const int incY) {
+  cblas_dscal(N, beta, Y, incY);
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+
+#endif  // USE_MKL
+#endif  // CAFFE_UTIL_MKL_ALTERNATE_H_
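As a quick illustration of the macro machinery above: in the non-MKL branch, a line
like DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); expands to roughly the
following, a templated loop plus two typed wrappers whose names match MKL's
vsAdd/vdAdd entry points (a sketch only; CHECK and CHECK_GT come from glog, which
caffe's common headers pull in):

    // Expansion sketch of DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
    template <typename Dtype>
    void vAdd(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
      CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y);
      // Element-wise: y[i] = a[i] + b[i].
      for (int i = 0; i < n; ++i) { y[i] = a[i] + b[i]; }
    }
    inline void vsAdd(const int n, const float* a, const float* b, float* y) {
      vAdd(n, a, b, y);  // Dtype deduced as float.
    }
    inline void vdAdd(const int n, const double* a, const double* b, double* y) {
      vAdd(n, a, b, y);  // Dtype deduced as double.
    }

Callers such as caffe_add can then invoke vsAdd/vdAdd unconditionally, and the name
resolves either to MKL's routine or to this fallback, depending on USE_MKL.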
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 3c0f15f..ef0074d 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -154,7 +154,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   int count = (*bottom)[0]->count();
   int num = (*bottom)[0]->num();
   // Compute the gradient
-  caffe_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
+  caffe_cpu_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
       (*bottom)[0]->mutable_cpu_diff());
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index eb02485..fb46c4e 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -215,7 +215,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
     // Compute the update value into history, then copy it to the blob's diff.
     Dtype local_rate = rate * net_params_lr[param_id];
     Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-    caffe_axpby(net_params[param_id]->count(), local_rate,
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
         net_params[param_id]->cpu_diff(), momentum,
         history_[param_id]->mutable_cpu_data());
     if (local_decay) {
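For clarity, caffe_cpu_axpby(N, alpha, X, beta, Y) computes Y = alpha*X + beta*Y
element-wise on the CPU, so the solver call above folds the learning rate and the
momentum into a single history update. A hand-written sketch of the same
arithmetic (illustrative names only, not Caffe API):

    #include <vector>

    // history = local_rate * diff + momentum * history, element-wise;
    // this is what the caffe_cpu_axpby call in ComputeUpdateValue() performs.
    void momentum_update(const std::vector<float>& diff, float local_rate,
                         float momentum, std::vector<float>* history) {
      for (size_t i = 0; i < history->size(); ++i) {
        (*history)[i] = local_rate * diff[i] + momentum * (*history)[i];
      }
    }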
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d0841e2..fb2b112 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -3,7 +3,6 @@
 #include <math.h>
 
 //#include <mkl.h>
-#include <eigen3/Eigen/Dense>
 #include <cublas_v2.h>
 
 #include "caffe/common.hpp"
@@ -13,23 +12,6 @@
 
 namespace caffe {
 
-// Operations on aligned memory are faster than on unaligned memory.
-// But unfortunately, the pointers passed in are not always aligned.
-// Therefore, the memory-aligned Eigen::Map objects that wrap them
-// cannot be assigned to. This happens in lrn_layer and makes
-// test_lrn_layer crash with segmentation fault.
-// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
-
-// Though the default map option is unaligned, making it explicit is no harm.
-//const int data_alignment = Eigen::Aligned;  // how is data allocated?
-const int data_alignment = Eigen::Unaligned;
-typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
-typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
-typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
-typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
-
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -126,7 +108,6 @@ template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
     double* Y) {
   cblas_daxpy(N, alpha, X, 1, Y, 1);
 }
-
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) {
@@ -194,186 +175,95 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  // y := a*x + b*y
-  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_float_t y_map(Y, N);
-  // Eigen produces optimized code using lasy evaluation
-  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
-  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_double_t y_map(Y, N);
-  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
-      const_map_vector_float_t(b, n);
+  vsAdd(n, a, b, y);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
-      const_map_vector_double_t(b, n);
+  vdAdd(n, a, b, y);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
-      const_map_vector_float_t(b, n);
+  vsSub(n, a, b, y);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
-      const_map_vector_double_t(b, n);
+  vdSub(n, a, b, y);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-      const_map_vector_float_t(b, n);
+  vsMul(n, a, b, y);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
-      const_map_vector_double_t(b, n);
+  vdMul(n, a, b, y);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
-      const_map_vector_float_t(b, n);
+  vsDiv(n, a, b, y);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
-      const_map_vector_double_t(b, n);
+  vdDiv(n, a, b, y);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-  //vsPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
+  vsPowx(n, a, b, y);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-  //vdPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
+  vdPowx(n, a, b, y);
 }
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
-  // v?Sqr performs element-by-element squaring of the vector.
-  //vsSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<float>(n, a, 2, y);
-  // TODO: which is faster?
-//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-//      const_map_vector_float_t(a, n);
+  vsSqr(n, a, y);
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  //vdSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<double>(n, a, 2, y);
+  vdSqr(n, a, y);
 }
 
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  //vsExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
+  vsExp(n, a, y);
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  //vdExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
+  vdExp(n, a, y);
 }
 
 template <>
-- 
2.7.4
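A note on the two-step axpby emulation in mkl_alternate.hpp: scaling Y by beta
first and then accumulating alpha*X yields exactly Y = alpha*X + beta*Y, which is
what MKL's one-call axpby computes; the cost relative to MKL's fused routine is an
extra pass over Y. A minimal self-contained check for the unit-stride case, with
hand-rolled stand-ins for cblas_sscal and cblas_saxpy (hypothetical helper names,
no BLAS required):

    #include <cassert>

    // Stand-in for cblas_sscal with unit stride: y *= beta.
    static void scal(int n, float beta, float* y) {
      for (int i = 0; i < n; ++i) y[i] *= beta;
    }
    // Stand-in for cblas_saxpy with unit stride: y += alpha * x.
    static void axpy(int n, float alpha, const float* x, float* y) {
      for (int i = 0; i < n; ++i) y[i] += alpha * x[i];
    }

    int main() {
      const float x[3] = {1.0f, 2.0f, 3.0f};
      float y[3] = {4.0f, 5.0f, 6.0f};
      scal(3, 0.5f, y);      // step 1: y = beta * y
      axpy(3, 2.0f, x, y);   // step 2: y += alpha * x, so y = alpha*x + beta*y
      assert(y[0] == 4.0f);  // 2*1 + 0.5*4
      assert(y[1] == 6.5f);  // 2*2 + 0.5*5
      assert(y[2] == 9.0f);  // 2*3 + 0.5*6
      return 0;
    }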