From a8c9b66b7f62610d71a18c798d5eb7157d49420c Mon Sep 17 00:00:00 2001
From: Rowland Depp
Date: Tue, 11 Feb 2014 21:41:01 -0800
Subject: [PATCH] major refactoring: allow coexistence of MKL and non-MKL cases

---
 Makefile                              |   8 ++
 Makefile.config.example               |   2 +
 include/caffe/util/math_functions.hpp |   7 +-
 include/caffe/util/mkl_alternate.hpp  |  95 +++++++++++++++++++++
 src/caffe/layers/loss_layer.cpp       |   2 +-
 src/caffe/solver.cpp                  |   2 +-
 src/caffe/util/math_functions.cpp     | 150 +++++-----------------------------
 7 files changed, 131 insertions(+), 135 deletions(-)
 create mode 100644 include/caffe/util/mkl_alternate.hpp

diff --git a/Makefile b/Makefile
index 6cc8f1e..488acb4 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,14 @@ LDFLAGS += $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) \
 		$(foreach library,$(LIBRARIES),-l$(library))
 PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 
+# MKL options
+ifdef USE_MKL
+  LIBRARIES += mkl_rt
+  COMMON_FLAGS += -DUSE_MKL
+else
+  LIBRARIES += atlas cblas
+endif
+
 
 ##############################
 # Define build targets
diff --git a/Makefile.config.example b/Makefile.config.example
index cec85e0..0ec2eea 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -10,6 +10,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
 		-gencode arch=compute_30,code=sm_30 \
 		-gencode arch=compute_35,code=sm_35
 
+# To build against MKL, uncomment the following line.
+# USE_MKL=1
 # MKL directory contains include/ and lib/ directories that we need.
 MKL_DIR := /opt/intel/mkl
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 1ff8a77..db19acc 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -4,10 +4,11 @@
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
-//#include <mkl.h>
-#include <eigen3/Eigen/Dense>
+
 #include <cublas_v2.h>
 
+#include "caffe/util/mkl_alternate.hpp"
+
 namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
@@ -46,7 +47,7 @@
 template <typename Dtype>
 void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y);
 
 template <typename Dtype>
-void caffe_axpby(const int N, const Dtype alpha, const Dtype* X,
+void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
     const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
new file mode 100644
index 0000000..1c207c6
--- /dev/null
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -0,0 +1,95 @@
+// Copyright 2013 Rowland Depp
+
+#ifndef CAFFE_UTIL_MKL_ALTERNATE_H_
+#define CAFFE_UTIL_MKL_ALTERNATE_H_
+
+#ifdef USE_MKL
+
+#include <mkl.h>
+
+#else  // If not using MKL, provide plain C++ alternatives to the MKL routines.
+
+#include <cblas.h>
+#include <math.h>
+
+// Functions that caffe uses but that are not present when MKL is not linked.
+
+// A simple way to define the vsl unary functions. The operation should
+// be in the form, e.g., y[i] = sqrt(a[i])
+#define DEFINE_VSL_UNARY_FUNC(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, float* y) { \
+    v##name(n, a, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, double* y) { \
+    v##name(n, a, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
+
+// A simple way to define the vsl unary functions with a scalar parameter b.
+// The operation should be in the form, e.g., y[i] = pow(a[i], b)
+#define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, const float b, float* y) { \
+    v##name(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double b, double* y) { \
+    v##name(n, a, b, y); \
+  }
+
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+
+// A simple way to define the vsl binary functions. The operation should
+// be in the form, e.g., y[i] = a[i] + b[i]
+#define DEFINE_VSL_BINARY_FUNC(name, operation) \
+  template <typename Dtype> \
+  void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y); \
+    for (int i = 0; i < n; ++i) { operation; } \
+  } \
+  inline void vs##name( \
+      const int n, const float* a, const float* b, float* y) { \
+    v##name(n, a, b, y); \
+  } \
+  inline void vd##name( \
+      const int n, const double* a, const double* b, double* y) { \
+    v##name(n, a, b, y); \
+  }
+
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+
+// In addition, MKL comes with an extra function, axpby, that is not present
+// in standard BLAS. We simply use a two-step (inefficient, of course) way
+// to mimic it.
+inline void cblas_saxpby(const int N, const float alpha, const float* X,
+                         const int incX, const float beta, float* Y,
+                         const int incY) {
+  cblas_sscal(N, beta, Y, incY);
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_daxpby(const int N, const double alpha, const double* X,
+                         const int incX, const double beta, double* Y,
+                         const int incY) {
+  cblas_dscal(N, beta, Y, incY);
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+
+#endif  // USE_MKL
+#endif  // CAFFE_UTIL_MKL_ALTERNATE_H_
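As a quick illustration of the macro machinery above: in the non-MKL branch, a line
like DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); expands to roughly the
following, a templated loop plus two typed wrappers whose names match MKL's
vsAdd/vdAdd entry points (a sketch only; CHECK and CHECK_GT come from glog, which
caffe's common headers pull in):

    // Expansion sketch of DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
    template <typename Dtype>
    void vAdd(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
      CHECK_GT(n, 0); CHECK(a); CHECK(b); CHECK(y);
      // Element-wise: y[i] = a[i] + b[i].
      for (int i = 0; i < n; ++i) { y[i] = a[i] + b[i]; }
    }
    inline void vsAdd(const int n, const float* a, const float* b, float* y) {
      vAdd(n, a, b, y);  // Dtype deduced as float.
    }
    inline void vdAdd(const int n, const double* a, const double* b, double* y) {
      vAdd(n, a, b, y);  // Dtype deduced as double.
    }

Callers such as caffe_add can then invoke vsAdd/vdAdd unconditionally, and the name
resolves either to MKL's routine or to this fallback, depending on USE_MKL.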
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 3c0f15f..ef0074d 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -154,7 +154,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   int count = (*bottom)[0]->count();
   int num = (*bottom)[0]->num();
   // Compute the gradient
-  caffe_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
+  caffe_cpu_axpby(count, Dtype(1) / num, difference_.cpu_data(), Dtype(0),
       (*bottom)[0]->mutable_cpu_diff());
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index eb02485..fb46c4e 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -215,7 +215,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
     // Compute the update value into history, then copy it to the blob's diff.
     Dtype local_rate = rate * net_params_lr[param_id];
     Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-    caffe_axpby(net_params[param_id]->count(), local_rate,
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
         net_params[param_id]->cpu_diff(), momentum,
         history_[param_id]->mutable_cpu_data());
     if (local_decay) {
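For clarity, caffe_cpu_axpby(N, alpha, X, beta, Y) computes Y = alpha*X + beta*Y
element-wise on the CPU, so the solver call above folds the learning rate and the
momentum into a single history update. A hand-written sketch of the same
arithmetic (illustrative names only, not Caffe API):

    #include <vector>

    // history = local_rate * diff + momentum * history, element-wise;
    // this is what the caffe_cpu_axpby call in ComputeUpdateValue() performs.
    void momentum_update(const std::vector<float>& diff, float local_rate,
                         float momentum, std::vector<float>* history) {
      for (size_t i = 0; i < history->size(); ++i) {
        (*history)[i] = local_rate * diff[i] + momentum * (*history)[i];
      }
    }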
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d0841e2..fb2b112 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -3,7 +3,6 @@
 #include <math.h>
 
 //#include <mkl.h>
-#include <eigen3/Eigen/Dense>
 #include <cublas_v2.h>
 
 #include "caffe/common.hpp"
@@ -13,23 +12,6 @@
 
 namespace caffe {
 
-// Operations on aligned memory are faster than on unaligned memory.
-// But unfortunately, the pointers passed in are not always aligned.
-// Therefore, the memory-aligned Eigen::Map objects that wrap them
-// cannot be assigned to. This happens in lrn_layer and makes
-// test_lrn_layer crash with segmentation fault.
-// TODO: Use aligned Eigen::Map when the pointer to be wrapped is aligned.
-
-// Though the default map option is unaligned, making it explicit is no harm.
-//const int data_alignment = Eigen::Aligned;  // how is data allocated?
-const int data_alignment = Eigen::Unaligned;
-typedef Eigen::Array<float, 1, Eigen::Dynamic> float_array_t;
-typedef Eigen::Map<const float_array_t, data_alignment> const_map_vector_float_t;
-typedef Eigen::Map<float_array_t, data_alignment> map_vector_float_t;
-typedef Eigen::Array<double, 1, Eigen::Dynamic> double_array_t;
-typedef Eigen::Map<const double_array_t, data_alignment> const_map_vector_double_t;
-typedef Eigen::Map<double_array_t, data_alignment> map_vector_double_t;
-
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -126,7 +108,6 @@ template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
     double* Y) {
   cblas_daxpy(N, alpha, X, 1, Y, 1);
 }
-
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) {
@@ -194,186 +175,95 @@ void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  // y := a*x + b*y
-  //cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_float_t y_map(Y, N);
-  // Eigen produces optimized code using lasy evaluation
-  // http://eigen.tuxfamily.org/dox/TopicLazyEvaluation.html
-  y_map = const_map_vector_float_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
-void caffe_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  // y := a*x + b*y
-  //cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-  CHECK_GE(N, 0);
-  CHECK(X);
-  CHECK(Y);
-  map_vector_double_t y_map(Y, N);
-  y_map = const_map_vector_double_t(X, N) * alpha + y_map * beta;
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) +
-      const_map_vector_float_t(b, n);
+  vsAdd(n, a, b, y);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdAdd(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) +
-      const_map_vector_double_t(b, n);
+  vdAdd(n, a, b, y);
 }
 
 template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) -
-      const_map_vector_float_t(b, n);
+  vsSub(n, a, b, y);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdSub(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) -
-      const_map_vector_double_t(b, n);
+  vdSub(n, a, b, y);
 }
 
 template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-      const_map_vector_float_t(b, n);
+  vsMul(n, a, b, y);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdMul(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) *
-      const_map_vector_double_t(b, n);
+  vdMul(n, a, b, y);
 }
 
 template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
     float* y) {
-  //vsDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) /
-      const_map_vector_float_t(b, n);
+  vsDiv(n, a, b, y);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
     double* y) {
-  //vdDiv(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(b);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n) /
-      const_map_vector_double_t(b, n);
+  vdDiv(n, a, b, y);
 }
 
 template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
     float* y) {
-  //vsPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).pow(b);
+  vsPowx(n, a, b, y);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
     double* y) {
-  //vdPowx(n, a, b, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).pow(b);
+  vdPowx(n, a, b, y);
 }
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  // http://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-F003F826-81BF-42EC-AE51-2EF624893133.htm
-  // v?Sqr performs element-by-element squaring of the vector.
-  //vsSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<float>(n, a, 2, y);
-  // TODO: which is faster?
-//  map_vector_float_t(y, n) = const_map_vector_float_t(a, n) *
-//      const_map_vector_float_t(a, n);
+  vsSqr(n, a, y);
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  //vdSqr(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  caffe_powx<double>(n, a, 2, y);
+  vdSqr(n, a, y);
 }
 
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  //vsExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_float_t(y, n) = const_map_vector_float_t(a, n).exp();
+  vsExp(n, a, y);
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  //vdExp(n, a, y);
-  CHECK_GE(n, 0);
-  CHECK(a);
-  CHECK(y);
-  map_vector_double_t(y, n) = const_map_vector_double_t(a, n).exp();
+  vdExp(n, a, y);
 }
 
 template <>
-- 
2.7.4
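A note on the two-step axpby emulation in mkl_alternate.hpp: scaling Y by beta
first and then accumulating alpha*X yields exactly Y = alpha*X + beta*Y, which is
what MKL's one-call axpby computes; the cost relative to MKL's fused routine is an
extra pass over Y. A minimal self-contained check for the unit-stride case, with
hand-rolled stand-ins for cblas_sscal and cblas_saxpy (hypothetical helper names,
no BLAS required):

    #include <cassert>

    // Stand-in for cblas_sscal with unit stride: y *= beta.
    static void scal(int n, float beta, float* y) {
      for (int i = 0; i < n; ++i) y[i] *= beta;
    }
    // Stand-in for cblas_saxpy with unit stride: y += alpha * x.
    static void axpy(int n, float alpha, const float* x, float* y) {
      for (int i = 0; i < n; ++i) y[i] += alpha * x[i];
    }

    int main() {
      const float x[3] = {1.0f, 2.0f, 3.0f};
      float y[3] = {4.0f, 5.0f, 6.0f};
      scal(3, 0.5f, y);      // step 1: y = beta * y
      axpy(3, 2.0f, x, y);   // step 2: y += alpha * x, so y = alpha*x + beta*y
      assert(y[0] == 4.0f);  // 2*1 + 0.5*4
      assert(y[1] == 6.5f);  // 2*2 + 0.5*5
      assert(y[2] == 9.0f);  // 2*3 + 0.5*6
      return 0;
    }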