#include "caffe/layer_factory.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/math_functions.hpp"
-
+#ifdef USE_PROFILING
+#include <stdint.h>
+#include <sys/time.h>
+
+// Seconds-to-microseconds factor (1e6): timeval carries tv_sec plus
+// tv_usec, so all timings below are in microseconds despite the name.
+#define NANO_SEC_CONV 1000000
+
+// Global bitmask selecting which profiling categories print timings.
+extern unsigned int acl_log_flags;
+#endif //USE_PROFILING
/**
Forward declare boost::thread instead of including boost/thread.hpp
 to avoid boost/NVCC issues (#1009, #1010) on OSX.
namespace boost { class mutex; }
namespace caffe {
+bool AclEnableSchedule(int enable=1);
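+// Minimal usage sketch (illustrative: the 0x1 category bit is a
+// hypothetical value; the real mask constants are defined elsewhere):
+//
+//   caffe::AclEnableSchedule(1);  // presumably toggles ACL's scheduler
+//   acl_log_flags = 0x1;          // emit timings for one category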
+#ifdef USE_PROFILING
+// Scoped timer: if the given mask bit is set in acl_log_flags, the
+// constructor records the start time and the destructor prints
+// "<label> <elapsed seconds>".
+class logtime_util
+{
+ public:
+  logtime_util(int mask_, const char* information_){
+    mask = mask_;
+    if(acl_log_flags & mask){
+      strncpy(information, information_, 255);
+      information[255] = '\0';  // strncpy does not terminate when truncating
+      gettimeofday(&tv[0], NULL);
+    }
+  }
+  ~logtime_util(){
+    if(acl_log_flags & mask){
+      int64_t time[2];  // microseconds; tv_sec * 1e6 overflows a 32-bit long
+      gettimeofday(&tv[1], NULL);
+      time[0] = (int64_t)tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+      time[1] = (int64_t)tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+      printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV));
+    }
+  }
+  // Restart (start == true) or report (start == false) the timer in place.
+  void log_time(bool start)
+  {
+    if(acl_log_flags & mask){
+      if (start){
+        gettimeofday(&tv[0], NULL);
+      }
+      else{
+        int64_t time[2];
+        gettimeofday(&tv[1], NULL);
+        time[0] = (int64_t)tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+        time[1] = (int64_t)tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+        printf("%s %.6lf\n", information, (((double)time[1] - time[0]) / NANO_SEC_CONV));
+      }
+    }
+  }
+ private:
+ struct timeval tv[2];
+ int mask;
+ char information[256];
+};
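+// Usage sketch (illustrative; MASK_CONV stands in for a real bit of
+// acl_log_flags, which is defined outside this header):
+//
+//   {
+//     logtime_util log_time(MASK_CONV, "conv forward:");
+//     // ... timed work; "conv forward: <seconds>" prints at scope exit
+//   }
+//
+// log_time(true) / log_time(false) restart and report the same timer
+// within a scope.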
+
+#ifdef LAYER_PERF_STAT
+
+struct perf_stat {
+  uint64_t total;
+  uint32_t start;
+  uint32_t end;
+  uint32_t used;
+  uint32_t count;
+
+  perf_stat(): total(0), start(0), end(0), used(0), count(0) {};
+};
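+// Interpretation sketch (an assumed convention, not defined here: 'used'
+// holds the latest sample and 'total'/'count' accumulate the history):
+//
+//   perf_stat st;
+//   st.used   = end_us - start_us;  // one measured interval
+//   st.total += st.used;
+//   st.count++;
+//   double avg = st.count ? (double)st.total / st.count : 0.0;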
+
+#endif
+#endif //USE_PROFILING
/**
* @brief An interface for the units of computation which can be composed into a
* layer.
*/
explicit Layer(const LayerParameter& param)
- : layer_param_(param), is_shared_(false) {
+ : layer_param_(param) {
// Set phase and copy blobs (if there are any).
phase_ = param.phase();
if (layer_param_.blobs_size() > 0) {
*/
void SetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- InitMutex();
CheckBlobCounts(bottom, top);
LayerSetUp(bottom, top);
Reshape(bottom, top);
const vector<Blob<Dtype>*>& top) {}
/**
- * @brief Whether a layer should be shared by multiple nets during data
- * parallelism. By default, all layers except for data layers should
- * not be shared. data layers should be shared to ensure each worker
- * solver access data sequentially during data parallelism.
- */
- virtual inline bool ShareInParallel() const { return false; }
-
- /** @brief Return whether this layer is actually shared by other nets.
- * If ShareInParallel() is true and using more than one GPU and the
- * net has TRAIN phase, then this function is expected return true.
- */
- inline bool IsShared() const { return is_shared_; }
-
- /** @brief Set whether this layer is actually shared by other nets
- * If ShareInParallel() is true and using more than one GPU and the
- * net has TRAIN phase, then is_shared should be set true.
- */
- inline void SetShared(bool is_shared) {
- CHECK(ShareInParallel() || !is_shared)
- << type() << "Layer does not support sharing.";
- is_shared_ = is_shared;
- }
-
- /**
* @brief Adjust the shapes of top blobs and internal buffers to accommodate
* the shapes of the bottom blobs.
*
*
* Your layer should implement Forward_cpu and (optionally) Forward_gpu.
*/
+#ifdef USE_PROFILING
+ Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+#else
inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+#endif //USE_PROFILING
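+  // Under USE_PROFILING this wrapper is defined out of line (in layer.cpp)
+  // so it can be instrumented. A sketch of the idea (not the actual
+  // definition; MASK_LAYER is a hypothetical flag bit):
+  //
+  //   template <typename Dtype>
+  //   Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
+  //       const vector<Blob<Dtype>*>& top) {
+  //     logtime_util log_time(MASK_LAYER, this->type());
+  //     // ... then the same Reshape + mode dispatch as the inline version
+  //   }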
/**
* @brief Given the top blob error gradients, compute the bottom blob error
}
param_propagate_down_[param_id] = value;
}
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+ const vector<Blob<Dtype>*> * saved_top;
+ const vector<Blob<Dtype>*> * saved_bottom;
+ perf_stat * get_time_stat(void) { return &time_stat_;}
+ perf_stat * get_pmu_stat(int index) { return &pmu_stat_[index];}
+
+#endif
+
+#endif //USE_PROFILING
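+  // Reading the counters after a run (a sketch; assumes the profiled
+  // Forward folds elapsed time, presumably microseconds, into time_stat_):
+  //
+  //   perf_stat* st = layer->get_time_stat();
+  //   if (st->count)
+  //     LOG(INFO) << layer->type() << ": avg "
+  //               << (double)st->total / st->count;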
protected:
/** The protobuf that stores the layer parameters */
}
private:
- /** Whether this layer is actually shared by other nets*/
- bool is_shared_;
-
- /** The mutex for sequential forward if this layer is shared */
- shared_ptr<boost::mutex> forward_mutex_;
-
- /** Initialize forward_mutex_ */
- void InitMutex();
- /** Lock forward_mutex_ if this layer is shared */
- void Lock();
- /** Unlock forward_mutex_ if this layer is shared */
- void Unlock();
-
DISABLE_COPY_AND_ASSIGN(Layer);
+
+#ifdef USE_PROFILING
+#ifdef LAYER_PERF_STAT
+ perf_stat time_stat_;
+ perf_stat pmu_stat_[16];
+#endif
+#endif //USE_PROFILING
}; // class Layer
+
+#ifndef LAYER_PERF_STAT
// Forward and backward wrappers. You should implement the cpu and
// gpu specific implementations instead, and should not change these
// functions.
template <typename Dtype>
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- // Lock during forward to ensure sequential forward
- Lock();
Dtype loss = 0;
Reshape(bottom, top);
switch (Caffe::mode()) {
default:
LOG(FATAL) << "Unknown caffe mode.";
}
- Unlock();
return loss;
}
+#endif
+
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,