HDF5DataLayer source is now a list of filenames

author Sergey Karayev <sergeykarayev@gmail.com>
Mon, 17 Mar 2014 02:55:59 +0000 (19:55 -0700)
committer Sergey Karayev <sergeykarayev@gmail.com>
Mon, 17 Mar 2014 02:55:59 +0000 (19:55 -0700)
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp

index d2daa98..4b14220 100644 (file)
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -400,11 +400,15 @@ class HDF5DataLayer : public Layer<Dtype> {
    virtual Dtype Backward_gpu(const vector<Blob<Dtype>*>& top,
        const bool propagate_down, vector<Blob<Dtype>*>* bottom);
  
+  std::vector<std::string> hdf_filenames_;
+  unsigned int num_files_;
+  unsigned int current_file_;
+  hsize_t current_row_;
+
    boost::scoped_ptr<Dtype> data_;
    boost::scoped_ptr<Dtype> label_;
    std::vector<hsize_t> data_dims_;
    std::vector<hsize_t> label_dims_;
-  hsize_t current_row_;
  };
  
  
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp

index 5ac594b..7f993a6 100644 (file)
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -4,11 +4,14 @@ Contributors:
  - Tobias Domhan, 2014.
  
  TODO:
-- only load parts of the file, in accordance with a prototxt param "max_mem"
+- load file in a separate thread ("prefetch")
+- can be smarter about the memcpy call instead of doing it row-by-row
  */
  #include <stdint.h>
  #include <string>
  #include <vector>
+#include <iostream>
+#include <fstream>
  
  #include "hdf5.h"
  #include "hdf5_hl.h"
@@ -61,9 +64,23 @@ void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
    CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs.";
    CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output.";
  
-  // Load the HDF5 file and initialize the counter.
-  const char* hdf_filename = this->layer_param_.source().c_str();
-  load_hdf5_file(hdf_filename);
+  // Read the source to parse the filenames.
+  LOG(INFO) << "Loading filename from " << this->layer_param_.source();
+  hdf_filenames_.clear();
+  std::ifstream myfile(this->layer_param_.source().c_str());
+  if (myfile.is_open()) {
+    string line = "";
+    while (myfile >> line) {
+      hdf_filenames_.push_back(line);
+    }
+  }
+  myfile.close();
+  num_files_ = hdf_filenames_.size();
+  current_file_ = 0;
+  LOG(INFO) << "Number of files: " << num_files_;
+
+  // Load the first HDF5 file and initialize the line counter.
+  load_hdf5_file(hdf_filenames_[current_file_].c_str());
    current_row_ = 0;
  
    // Reshape blobs.
@@ -83,10 +100,18 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const int data_count = (*top)[0]->count() / (*top)[0]->num();
    const int label_data_count = (*top)[1]->count() / (*top)[1]->num();
  
-  //TODO: consolidate into a single memcpy call
-
    for (int i = 0; i < batchsize; ++i, ++current_row_) {
      if (current_row_ == data_dims_[0]) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          LOG(INFO) << "looping around to first file";
+        }
+
+        load_hdf5_file(hdf_filenames_[current_file_].c_str());
+      }
        current_row_ = 0;
      }
  
@@ -100,7 +125,6 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    }
  }
  
-
  // The backward operations are dummy - they do not carry any computation.
  template <typename Dtype>
  Dtype HDF5DataLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu

index 773ebc3..7a31a60 100644 (file)
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -28,6 +28,16 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
  
    for (int i = 0; i < batchsize; ++i, ++current_row_) {
      if (current_row_ == data_dims_[0]) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          LOG(INFO) << "looping around to first file";
+        }
+
+        load_hdf5_file(hdf_filenames_[current_file_].c_str());
+      }
        current_row_ = 0;
      }
  
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py

index ab55305..0d8f5aa 100644 (file)
--- a/src/caffe/test/test_data/generate_sample_data.py
+++ b/src/caffe/test/test_data/generate_sample_data.py
@@ -1,7 +1,7 @@
  """
  Generate data used in the HDF5DataLayer test.
  """
-
+import os
  import numpy as np
  import h5py
  
@@ -9,11 +9,31 @@ num_cols = 8
  num_rows = 10
  height = 5
  width = 5
-data = np.arange(num_cols * num_rows * height * width).reshape(num_rows, num_cols, height, width)
+total_size = num_cols * num_rows * height * width
+
+data = np.arange(total_size)
+data = data.reshape(num_rows, num_cols, height, width)
+data = data.astype('float32')
  label = np.arange(num_rows)[:, np.newaxis]
+label = label.astype('float32')
+
  print data
  print label
  
-with h5py.File('./sample_data.h5', 'w') as f:
-    f['data'] = data.astype('float32')
-    f['label'] = label.astype('float32')
+with h5py.File(os.path.dirname(__file__) + '/sample_data.h5', 'w') as f:
+    f['data'] = data
+    f['label'] = label
+
+with h5py.File(os.path.dirname(__file__) + '/sample_data_2_gzip.h5', 'w') as f:
+    f.create_dataset(
+        'data', data=data + total_size,
+        compression='gzip', compression_opts=1
+    )
+    f.create_dataset(
+        'label', data=label,
+        compression='gzip', compression_opts=1
+    )
+
+with open(os.path.dirname(__file__) + '/sample_data_list.txt', 'w') as f:
+    f.write(os.path.dirname(__file__) + '/sample_data.h5\n')
+    f.write(os.path.dirname(__file__) + '/sample_data_2_gzip.h5\n')
diff --git a/src/caffe/test/test_data/sample_data.h5 b/src/caffe/test/test_data/sample_data.h5

index db245ba..a1f923a 100644 (file)

Binary files a/src/caffe/test/test_data/sample_data.h5 and b/src/caffe/test/test_data/sample_data.h5 differ
diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5

new file mode 100644 (file)

index 0000000..56c0a74

Binary files /dev/null and b/src/caffe/test/test_data/sample_data_2_gzip.h5 differ
diff --git a/src/caffe/test/test_data/sample_data_list.txt b/src/caffe/test/test_data/sample_data_list.txt

new file mode 100644 (file)

index 0000000..cdf343f
--- /dev/null
+++ b/src/caffe/test/test_data/sample_data_list.txt
@@ -0,0 +1,2 @@
+src/caffe/test/test_data/sample_data.h5
+src/caffe/test/test_data/sample_data_2_gzip.h5
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp

index d4f3c4a..59aee0c 100644 (file)
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -31,10 +31,8 @@ class HDF5DataLayerTest : public ::testing::Test {
      blob_top_vec_.push_back(blob_top_data_);
      blob_top_vec_.push_back(blob_top_label_);
  
-    // TODO: generate sample HDF5 file on the fly.
-    // For now, use example HDF5 file.
-    // TODO: how to best deal with the relativeness of the path?
-    filename = new string("src/caffe/test/test_data/sample_data.h5");
+    // Check out generate_sample_data.py in the same directory.
+    filename = new string("src/caffe/test/test_data/sample_data_list.txt");
      LOG(INFO) << "Using sample HDF5 data file " << filename;
    }
  
@@ -80,62 +78,43 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) {
    EXPECT_EQ(this->blob_top_label_->height(), 1);
    EXPECT_EQ(this->blob_top_label_->width(), 1);
  
-  const int data_size = num_cols * height * width;
-
-  // Go through the data 100 times.
-  for (int iter = 0; iter < 100; ++iter) {
-    layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
-    // On even iterations, we're reading the first half of the data.
-    // On odd iterations, we're reading the second half of the data.
-    int label_offset = (iter % 2 == 0) ? 0 : batchsize;
-    int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
-    for (int i = 0; i < batchsize; ++i) {
-      EXPECT_EQ(
-        label_offset + i,
-        this->blob_top_label_->cpu_data()[i]);
-    }
-    for (int i = 0; i < batchsize; ++i) {
-      for (int j = 0; j < num_cols; ++j) {
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            int idx = i * num_cols * height * width + j * height * width + h * width + w;
-            EXPECT_EQ(
-              data_offset + idx,
-              this->blob_top_data_->cpu_data()[idx])
-              << "debug: i " << i << " j " << j;
-          }
-        }
-      }
+  for (int t=0; t<2; ++t) {
+    if (t == 0) {
+      Caffe::set_mode(Caffe::CPU);
+    } else {
+      Caffe::set_mode(Caffe::GPU);
      }
-  }
  
-  // Exact same test in GPU mode.
-  Caffe::set_mode(Caffe::GPU);
-  // Go through the data 100 times.
-  for (int iter = 0; iter < 100; ++iter) {
-    layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
-    // On even iterations, we're reading the first half of the data.
-    // On odd iterations, we're reading the second half of the data.
-    int label_offset = (iter % 2 == 0) ? 0 : batchsize;
-    int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
-    for (int i = 0; i < batchsize; ++i) {
-      EXPECT_EQ(
-        label_offset + i,
-        this->blob_top_label_->cpu_data()[i]);
-    }
-    for (int i = 0; i < batchsize; ++i) {
-      for (int j = 0; j < num_cols; ++j) {
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            int idx = i * num_cols * height * width + j * height * width + h * width + w;
-            EXPECT_EQ(
-              data_offset + idx,
-              this->blob_top_data_->cpu_data()[idx])
-              << "debug: i " << i << " j " << j;
+    // Go through the data 100 times (50 batches).
+    const int data_size = num_cols * height * width;
+    for (int iter = 0; iter < 100; ++iter) {
+      layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
+
+      // On even iterations, we're reading the first half of the data.
+      // On odd iterations, we're reading the second half of the data.
+      int label_offset = (iter % 2 == 0) ? 0 : batchsize;
+      int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
+
+      // Every two iterations we are reading the second file,
+      // which has the same labels, but data is offset by total data size,
+      // which is 2000 (see generate_sample_data).
+      int file_offset = (iter % 4 < 2) ? 0 : 2000;
+
+      for (int i = 0; i < batchsize; ++i) {
+        EXPECT_EQ(
+          label_offset + i,
+          this->blob_top_label_->cpu_data()[i]);
+      }
+      for (int i = 0; i < batchsize; ++i) {
+        for (int j = 0; j < num_cols; ++j) {
+          for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+              int idx = i * num_cols * height * width + j * height * width + h * width + w;
+              EXPECT_EQ(
+                file_offset + data_offset + idx,
+                this->blob_top_data_->cpu_data()[idx])
+                << "debug: i " << i << " j " << j << " iter " << iter;
+            }
            }
          }
        }
author	Sergey Karayev <sergeykarayev@gmail.com>
	Mon, 17 Mar 2014 02:55:59 +0000 (19:55 -0700)
committer	Sergey Karayev <sergeykarayev@gmail.com>
	Mon, 17 Mar 2014 02:55:59 +0000 (19:55 -0700)
include/caffe/vision_layers.hpp		patch \| blob \| history
src/caffe/layers/hdf5_data_layer.cpp		patch \| blob \| history
src/caffe/layers/hdf5_data_layer.cu		patch \| blob \| history
src/caffe/test/test_data/generate_sample_data.py		patch \| blob \| history
src/caffe/test/test_data/sample_data.h5		patch \| blob \| history
src/caffe/test/test_data/sample_data_2_gzip.h5	[new file with mode: 0644]	patch \| blob
src/caffe/test/test_data/sample_data_list.txt	[new file with mode: 0644]	patch \| blob
src/caffe/test/test_hdf5data_layer.cpp		patch \| blob \| history