#include "caffeine/filler.hpp"
#include "caffeine/layer.hpp"
#include "caffeine/vision_layers.hpp"
+#include "caffeine/util/blas.hpp"
namespace caffeine {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = (*top)[0]->mutable_cpu_data();
const Dtype* weight = this->blobs_[0].cpu_data();
- const Dtype* bias = NULL;
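+ // top = bottom * weight: (M_ x K_) * (K_ x N_) -> (M_ x N_).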
+ caffeine_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_, (Dtype)1.,
+ bottom_data, weight, (Dtype)0., top_data);
if (biasterm_) {
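+ // Broadcast the bias across all M_ rows via a rank-1 update with the
+ // ones vector in bias_multiplier_ (see the sketch after this function).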
- bias = this->blobs_[1].cpu_data();
- }
- switch(sizeof(Dtype)) {
- case sizeof(float):
- // matrix multiply
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M_, N_, K_,
- 1., (const float*)bottom_data, K_, (const float*)weight, N_, 0.,
- (float*)top_data, N_);
- if (bias) {
- // add bias
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M_, N_, 1,
- 1., (const float*)bias_multiplier_->cpu_data(), 1,
- (const float*)bias, N_, 1., (float*)top_data, N_);
- }
- break;
- case sizeof(double):
- // matrix multiply
- cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M_, N_, K_,
- 1., (const double*)bottom_data, K_, (const double*)weight, N_, 0.,
- (double*)top_data, N_);
- if (bias) {
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M_, N_, 1,
- 1., (const float*)bias_multiplier_->cpu_data(), 1,
- (const float*)bias, N_, 1., (float*)top_data, N_);
- }
- break;
- default:
- CHECK(false) << "Unknown data type.";
+ caffeine_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
+ bias_multiplier_->cpu_data(), this->blobs_[1].cpu_data(),
+ (Dtype)1., top_data);
}
}
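// A minimal standalone sketch of the bias trick above, assuming only cblas
// (values are illustrative): multiplying an M x 1 column of ones by the
// 1 x N bias row copies the bias into every row of the M x N output, so no
// explicit loop over rows is needed.
//
//   float ones[2] = {1.f, 1.f};          // M = 2 "bias multiplier"
//   float bias[3] = {10.f, 20.f, 30.f};  // N = 3 bias row
//   float top[6]  = {0.f};               // M x N output, starts at zero
//   // top += ones * bias, i.e. each row of top becomes the bias row.
//   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 3, 1,
//               1.f, ones, 1, bias, 3, 1.f, top, 3);
//   // top == {10, 20, 30, 10, 20, 30}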
if (sizeof(TypeParam) == 4 || CAFFEINE_TEST_CUDA_PROP.major >= 2) {
// [1 2 3; 4 5 6] * [1 2 3 4; 5 6 7 8; 9 10 11 12]
- decaf_cpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
+ caffeine_cpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
- decaf_gpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
+ caffeine_gpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
// Test when we have a transposed A
A.Reshape(1,1,3,2);
memcpy(A.mutable_cpu_data(), A_reshape_data, 6 * sizeof(TypeParam));
- decaf_cpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
+ caffeine_cpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
- decaf_gpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
+ caffeine_gpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
// Test when we have a transposed A and a transposed B too
B.Reshape(1,1,4,3);
memcpy(B.mutable_cpu_data(), B_reshape_data, 12 * sizeof(TypeParam));
- decaf_cpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
+ caffeine_cpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
- decaf_gpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
+ caffeine_gpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
// Test when we have a transposed B
A.Reshape(1,1,2,3);
memcpy(A.mutable_cpu_data(), data, 6 * sizeof(TypeParam));
- decaf_cpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
+ caffeine_cpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
- decaf_gpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
+ caffeine_gpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
for (int i = 0; i < 8; ++i) {
EXPECT_EQ(C.cpu_data()[i], result[i]);
}
memcpy(x.mutable_cpu_data(), data, 3 * sizeof(TypeParam));
if (sizeof(TypeParam) == 4 || CAFFEINE_TEST_CUDA_PROP.major >= 2) {
- decaf_cpu_gemv<TypeParam>(CblasNoTrans, 2, 3, 1., A.cpu_data(),
+ caffeine_cpu_gemv<TypeParam>(CblasNoTrans, 2, 3, 1., A.cpu_data(),
x.cpu_data(), 0., y.mutable_cpu_data());
for (int i = 0; i < 2; ++i) {
EXPECT_EQ(y.cpu_data()[i], result_2[i]);
}
- decaf_gpu_gemv<TypeParam>(CblasNoTrans, 2, 3, 1., A.gpu_data(),
+ caffeine_gpu_gemv<TypeParam>(CblasNoTrans, 2, 3, 1., A.gpu_data(),
x.gpu_data(), 0., y.mutable_gpu_data());
for (int i = 0; i < 2; ++i) {
EXPECT_EQ(y.cpu_data()[i], result_2[i]);
}
// Test transpose case
memcpy(y.mutable_cpu_data(), data, 2 * sizeof(TypeParam));
- decaf_cpu_gemv<TypeParam>(CblasTrans, 2, 3, 1., A.cpu_data(),
+ caffeine_cpu_gemv<TypeParam>(CblasTrans, 2, 3, 1., A.cpu_data(),
y.cpu_data(), 0., x.mutable_cpu_data());
for (int i = 0; i < 3; ++i) {
EXPECT_EQ(x.cpu_data()[i], result_3[i]);
}
- decaf_gpu_gemv<TypeParam>(CblasTrans, 2, 3, 1., A.gpu_data(),
+ caffeine_gpu_gemv<TypeParam>(CblasTrans, 2, 3, 1., A.gpu_data(),
y.gpu_data(), 0., x.mutable_gpu_data());
for (int i = 0; i < 3; ++i) {
EXPECT_EQ(x.cpu_data()[i], result_3[i]);
}
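// The transpose cases above reuse one expected output: writing A's six
// values in a 3 x 2 layout (A_reshape_data) and passing CblasTrans describes
// the same logical 2 x 3 matrix, e.g. [1 2 3; 4 5 6] stored transposed is
// [1 4; 2 5; 3 6], so every variant must reproduce `result`.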
namespace caffeine {
template<>
-void decaf_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+void caffeine_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C) {
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
      ldb, beta, C, N);
}
template<>
-void decaf_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+void caffeine_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C) {
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
      ldb, beta, C, N);
}
template <>
-void decaf_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+void caffeine_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const float alpha, const float* A, const float* B, const float beta,
float* C) {
  // cuBLAS is Fortran-order, so compute C^T = B^T * A^T by swapping the
  // operands. CUBLAS_CHECK and Caffeine::cublas_handle() are assumed from
  // the elided caffeine CUDA setup.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasSgemm(Caffeine::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
-void decaf_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+void caffeine_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const double alpha, const double* A, const double* B, const double beta,
double* C) {
  // Same Fortran-order mapping as the float specialization above.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasDgemm(Caffeine::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
-void decaf_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffeine_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
const int N, const float alpha, const float* A, const float* x,
const float beta, float* y) {
cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
}
template <>
-void decaf_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffeine_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
const int N, const double alpha, const double* A, const double* x,
const double beta, double* y) {
cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
}
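// Usage sketch for the row-major gemv wrappers above (illustrative values):
//
//   float A[6] = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
//   float x[3] = {1, 1, 1};
//   float y[2];
//   caffeine_cpu_gemv<float>(CblasNoTrans, 2, 3, 1.f, A, x, 0.f, y);
//   // y == {6, 15}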
template <>
-void decaf_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffeine_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
const int N, const float alpha, const float* A, const float* x,
const float beta, float* y) {
cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  // A row-major M x N matrix is a column-major N x M matrix, hence the
  // swapped dimensions and flipped transpose flag.
  CUBLAS_CHECK(cublasSgemv(Caffeine::cublas_handle(), cuTransA, N, M, &alpha,
      A, N, x, 1, &beta, y, 1));
}
template <>
-void decaf_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffeine_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
const int N, const double alpha, const double* A, const double* x,
const double beta, double* y) {
cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  CUBLAS_CHECK(cublasDgemv(Caffeine::cublas_handle(), cuTransA, N, M, &alpha,
      A, N, x, 1, &beta, y, 1));
}
-// Decaf gemm provides a simpler interface to the gemm functions, with the
+// Caffeine gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <typename Dtype>
-inline void decaf_cpu_gemm(const CBLAS_TRANSPOSE TransA,
+void caffeine_cpu_gemm(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
Dtype* C);
-// gemm function - following the c convention and calling the fortran-order
-// gpu code under the hood.
+// gemm function following the C convention (row-major inputs) and calling
+// the Fortran-order cuBLAS code under the hood; see the sketch after this
+// declaration.
template <typename Dtype>
-void decaf_gpu_gemm(const CBLAS_TRANSPOSE TransA,
+void caffeine_gpu_gemm(const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
Dtype* C);
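// The mapping relies on the identity (A * B)^T = B^T * A^T: asking
// column-major cuBLAS for C^T with the operands in swapped order reads and
// writes exactly the buffers a row-major GEMM would. A sketch of the
// underlying call, with a hypothetical `handle` (see the .cu definitions):
//
//   cublasSgemm(handle, cuTransB, cuTransA, N, M, K,
//               &alpha, B, ldb, A, lda, &beta, C, N);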
template <typename Dtype>
-void decaf_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+void caffeine_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
Dtype* y);
template <typename Dtype>
-void decaf_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+void caffeine_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
Dtype* y);
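// End-to-end usage sketch of the CPU wrapper (illustrative values),
// respecting the contiguity requirement noted above:
//
//   float A[6] = {1, 2, 3, 4, 5, 6};   // 2 x 3
//   float B[6] = {1, 0, 0, 1, 1, 1};   // 3 x 2
//   float C[4];                        // 2 x 2 result
//   caffeine_cpu_gemm<float>(CblasNoTrans, CblasNoTrans, 2, 2, 3,
//                            1.f, A, B, 0.f, C);
//   // C == {4, 5, 10, 11}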