From 6f1de8bf5aa9078e65bf90faab518f7eb61dc461 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 19 Sep 2013 16:26:01 -0700 Subject: [PATCH] im2col --- .gitignore | 1 + src/Makefile | 4 +- src/caffeine/layers/dropout_layer.cu | 2 + src/caffeine/layers/im2col_layer.cpp | 12 +++++ src/caffeine/layers/padding_layer.cu | 1 + src/caffeine/test/test_im2col_layer.cpp | 15 ++++++ src/caffeine/util/im2col.cu | 87 +++++++++++++++++++++++++++++++++ src/caffeine/util/im2col.hpp | 8 +++ src/caffeine/vision_layers.hpp | 4 +- 9 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 src/caffeine/util/im2col.cu diff --git a/.gitignore b/.gitignore index f4489a6..ca27edc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.slo *.lo *.o +*.cuo # Compiled Dynamic libraries *.so diff --git a/src/Makefile b/src/Makefile index cd89c7d..2b53501 100644 --- a/src/Makefile +++ b/src/Makefile @@ -17,7 +17,7 @@ PROTO_GEN_HEADER := ${PROTO_SRCS:.proto=.pb.h} PROTO_GEN_CC := ${PROTO_SRCS:.proto=.pb.cc} PROTO_GEN_PY := ${PROTO_SRCS:.proto=_pb2.py} CXX_OBJS := ${CXX_SRCS:.cpp=.o} -CU_OBJS := ${CU_SRCS:.cu=.o} +CU_OBJS := ${CU_SRCS:.cu=.cuo} PROTO_OBJS := ${PROTO_SRCS:.proto=.pb.o} OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS) TEST_OBJS := ${TEST_SRCS:.cpp=.o} @@ -63,7 +63,7 @@ $(TEST_BINS): %.testbin : %.o $(NAME): $(PROTO_GEN_CC) $(OBJS) $(LINK) -shared $(OBJS) -o $(NAME) -$(CU_OBJS): %.o: %.cu +$(CU_OBJS): %.cuo: %.cu $(NVCC) -c $< -o $@ $(PROTO_GEN_CC): $(PROTO_SRCS) diff --git a/src/caffeine/layers/dropout_layer.cu b/src/caffeine/layers/dropout_layer.cu index 9818907..2d9e260 100644 --- a/src/caffeine/layers/dropout_layer.cu +++ b/src/caffeine/layers/dropout_layer.cu @@ -82,6 +82,7 @@ void DropoutLayer::Forward_gpu(const vector*>& bottom, DropoutForward<<>>( count, bottom_data, (unsigned int*)rand_vec_->gpu_data(), uint_thres_, scale_, top_data); + CUDA_POST_KERNEL_CHECK; } else { CUDA_CHECK(cudaMemcpy(top_data, bottom_data, count * sizeof(Dtype), cudaMemcpyDeviceToDevice)); @@ -112,6 +113,7 @@ Dtype DropoutLayer::Backward_gpu(const vector*>& top, const int count = (*bottom)[0]->count(); DropoutBackward<<>>( count, top_diff, mask, uint_thres_, scale_, bottom_diff); + CUDA_POST_KERNEL_CHECK; } return Dtype(0); } diff --git a/src/caffeine/layers/im2col_layer.cpp b/src/caffeine/layers/im2col_layer.cpp index c6020e3..ee87493 100644 --- a/src/caffeine/layers/im2col_layer.cpp +++ b/src/caffeine/layers/im2col_layer.cpp @@ -1,6 +1,7 @@ #include "caffeine/layer.hpp" #include "caffeine/util/im2col.hpp" #include "caffeine/vision_layers.hpp" +#include "caffeine/common.hpp" namespace caffeine { @@ -30,6 +31,17 @@ void Im2colLayer::Forward_cpu(const vector*>& bottom, } template +void Im2colLayer::Forward_gpu(const vector*>& bottom, + vector*>* top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = (*top)[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data + bottom[0]->offset(n), CHANNELS_, HEIGHT_, + WIDTH_, KSIZE_, STRIDE_, top_data + (*top)[0]->offset(n)); + } +} + +template Dtype Im2colLayer::Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom) { const Dtype* top_diff = top[0]->cpu_diff(); diff --git a/src/caffeine/layers/padding_layer.cu b/src/caffeine/layers/padding_layer.cu index 2171467..53bd1f5 100644 --- a/src/caffeine/layers/padding_layer.cu +++ b/src/caffeine/layers/padding_layer.cu @@ -126,6 +126,7 @@ Dtype PaddingLayer::Backward_gpu(const vector*>& top, PaddingBackward<<>>( count, top_diff, bottom_diff, NUM_, CHANNEL_, HEIGHT_IN_, WIDTH_IN_, PAD_); + CUDA_POST_KERNEL_CHECK; } return Dtype(0); } diff --git a/src/caffeine/test/test_im2col_layer.cpp b/src/caffeine/test/test_im2col_layer.cpp index 7456dfb..e2b28e6 100644 --- a/src/caffeine/test/test_im2col_layer.cpp +++ b/src/caffeine/test/test_im2col_layer.cpp @@ -64,6 +64,21 @@ TYPED_TEST(Im2colLayerTest, TestCPU) { } } +TYPED_TEST(Im2colLayerTest, TestGPU) { + LayerParameter layer_param; + layer_param.set_kernelsize(3); + layer_param.set_stride(2); + Im2colLayer layer(layer_param); + Caffeine::set_mode(Caffeine::GPU); + layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_)); + layer.Forward(this->blob_bottom_vec_, &(this->blob_top_vec_)); + // We are lazy and will only check the top left block + for (int c = 0; c < 27; ++c) { + EXPECT_EQ(this->blob_bottom_->data_at(0, (c / 9), (c / 3) % 3, c % 3), + this->blob_top_->data_at(0, c, 0, 0)); + } +} + TYPED_TEST(Im2colLayerTest, TestCPUGradient) { LayerParameter layer_param; layer_param.set_kernelsize(3); diff --git a/src/caffeine/util/im2col.cu b/src/caffeine/util/im2col.cu new file mode 100644 index 0000000..ac9e845 --- /dev/null +++ b/src/caffeine/util/im2col.cu @@ -0,0 +1,87 @@ +#include +#include +#include + +#include "caffeine/common.hpp" +#include "caffeine/util/im2col.hpp" + +namespace caffeine { + +template +__global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, + const int height, const int width, const int ksize, + const int stride, const int height_col, const int width_col, Dtype* data_col) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + int channel_in = index / height_col; + int channel_out = channel_in * ksize * ksize; + int h_in = h_out * stride; + int w_in = w_out * stride; + data_col += (channel_out * height_col + h_out) * width_col + w_out; + data_im += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < ksize; ++i) { + for (int j = 0; j < ksize; ++j) { + *data_col = data_im[i * width + j]; + data_col += height_col * width_col; + } + } + } +} + +template +void im2col_gpu(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int stride, + Dtype* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int height_col = (height - ksize) / stride + 1; + int width_col = (width - ksize) / stride + 1; + int num_kernels = channels * height_col * width_col; + im2col_gpu_kernel<<>>( + num_kernels, data_im, height, width, ksize, stride, height_col, width_col, + data_col); + CUDA_POST_KERNEL_CHECK; +} + +// Explicit instantiation +template void im2col_gpu(const float* data_im, const int channels, + const int height, const int width, const int ksize, const int stride, + float* data_col); +template void im2col_gpu(const double* data_im, const int channels, + const int height, const int width, const int ksize, const int stride, + double* data_col); + +/* +template +void col2im_gpu(const Dtype* data_col, const int channels, + const int height, const int width, const int ksize, const int stride, + Dtype* data_im) { + memset(data_im, 0, sizeof(Dtype) * height * width * channels); + int height_col = (height - ksize) / stride + 1; + int width_col = (width - ksize) / stride + 1; + int channels_col = channels * ksize * ksize; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % ksize; + int h_offset = (c / ksize) % ksize; + int c_im = c / ksize / ksize; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + data_im[(c_im * height + h * stride + h_offset) * width + w * stride + + w_offset] += data_col[(c * height_col + h) * width_col + w]; + } + } + } +} + +// Explicit instantiation +template void col2im_gpu(const float* data_col, const int channels, + const int height, const int width, const int psize, const int stride, + float* data_im); +template void col2im_gpu(const double* data_col, const int channels, + const int height, const int width, const int psize, const int stride, + double* data_im); +*/ +} // namespace caffeine diff --git a/src/caffeine/util/im2col.hpp b/src/caffeine/util/im2col.hpp index 76f401d..f634990 100644 --- a/src/caffeine/util/im2col.hpp +++ b/src/caffeine/util/im2col.hpp @@ -13,7 +13,15 @@ void col2im_cpu(const Dtype* data_col, const int channels, const int height, const int width, const int psize, const int stride, Dtype* data_im); +template +void im2col_gpu(const Dtype* data_im, const int channels, + const int height, const int width, const int ksize, const int stride, + Dtype* data_col); +template +void col2im_gpu(const Dtype* data_col, const int channels, + const int height, const int width, const int psize, const int stride, + Dtype* data_im); } // namespace caffeine diff --git a/src/caffeine/vision_layers.hpp b/src/caffeine/vision_layers.hpp index d931bc2..2d24bf8 100644 --- a/src/caffeine/vision_layers.hpp +++ b/src/caffeine/vision_layers.hpp @@ -146,8 +146,8 @@ class Im2colLayer : public Layer { protected: virtual void Forward_cpu(const vector*>& bottom, vector*>* top); - //virtual void Forward_gpu(const vector*>& bottom, - // vector*>* top); + virtual void Forward_gpu(const vector*>& bottom, + vector*>* top); virtual Dtype Backward_cpu(const vector*>& top, const bool propagate_down, vector*>* bottom); //virtual Dtype Backward_gpu(const vector*>& top, -- 2.7.4