--- /dev/null
+#include <cstring>
+#include <cuda_runtime.h>
+#include <mkl.h>
+#include <cublas_v2.h>
+
+#include "gtest/gtest.h"
+#include "caffeine/blob.hpp"
+#include "caffeine/util/gemm.hpp"
+
+namespace caffeine {
+
+extern cudaDeviceProp CAFFEINE_TEST_CUDA_PROP;
+
+typedef ::testing::Types<float, double> Dtypes;
+
+template <typename Dtype>
+class GemmTest : public ::testing::Test {};
+
+TYPED_TEST_CASE(GemmTest, Dtypes);
+
+TYPED_TEST(GemmTest, TestGemm) {
+ Blob<TypeParam> A(1,1,2,3);
+ Blob<TypeParam> B(1,1,3,4);
+ Blob<TypeParam> C(1,1,2,4);
+ TypeParam data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+ TypeParam A_reshape_data[6] = {1, 4, 2, 5, 3, 6};
+ TypeParam B_reshape_data[12] = {1,5,9,2,6,10,3,7,11,4,8,12};
+ TypeParam result[8] = {38,44,50,56,83,98,113,128};
+ memcpy(A.mutable_cpu_data(), data, 6 * sizeof(TypeParam));
+ memcpy(B.mutable_cpu_data(), data, 12 * sizeof(TypeParam));
+
+ if (sizeof(TypeParam) == 4 || CAFFEINE_TEST_CUDA_PROP.major >= 2) {
+ //[1,2,3; 4 5 6] * [1,2,3,4; 5,6,7,8; 9,10,11,12];
+ decaf_cpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
+ A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+ decaf_gpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
+ A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+
+ // Test when we have a transposed A
+ A.Reshape(1,1,3,2);
+ memcpy(A.mutable_cpu_data(), A_reshape_data, 6 * sizeof(TypeParam));
+ decaf_cpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
+ A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+ decaf_gpu_gemm<TypeParam>(CblasTrans, CblasNoTrans, 2, 4, 3, 1.,
+ A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+
+ // Test when we have a transposed A and a transposed B too
+ B.Reshape(1,1,4,3);
+ memcpy(B.mutable_cpu_data(), B_reshape_data, 12 * sizeof(TypeParam));
+ decaf_cpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
+ A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+ decaf_gpu_gemm<TypeParam>(CblasTrans, CblasTrans, 2, 4, 3, 1.,
+ A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+
+ // Test when we have a transposed B
+ A.Reshape(1,1,2,3);
+ memcpy(A.mutable_cpu_data(), data, 6 * sizeof(TypeParam));
+ decaf_cpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
+ A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+ decaf_gpu_gemm<TypeParam>(CblasNoTrans, CblasTrans, 2, 4, 3, 1.,
+ A.gpu_data(), B.gpu_data(), 0., C.mutable_gpu_data());
+ for (int i = 0; i < 8; ++i) {
+ EXPECT_EQ(C.cpu_data()[i], result[i]);
+ }
+ } else {
+ LOG(ERROR) << "Skipping test due to old architecture.";
+ }
+}
+
+
+}
--- /dev/null
+#include <mkl.h>
+#include <cublas_v2.h>
+#include "caffeine/common.hpp"
+#include "caffeine/util/gemm.hpp"
+
+namespace caffeine {
+
+template<>
+void decaf_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const float alpha, const float* A, const float* B, const float beta,
+ float* C) {
+ int lda = (TransA == CblasNoTrans) ? K : M;
+ int ldb = (TransB == CblasNoTrans) ? N : K;
+ cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+ ldb, beta, C, N);
+}
+
+template<>
+void decaf_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const double alpha, const double* A, const double* B, const double beta,
+ double* C) {
+ int lda = (TransA == CblasNoTrans) ? K : M;
+ int ldb = (TransB == CblasNoTrans) ? N : K;
+ cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+ ldb, beta, C, N);
+}
+
+template <>
+void decaf_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const float alpha, const float* A, const float* B, const float beta,
+ float* C) {
+ // Note that cublas follows fortran order.
+ int lda = (TransA == CblasNoTrans) ? K : M;
+ int ldb = (TransB == CblasNoTrans) ? N : K;
+ cublasOperation_t cuTransA =
+ (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t cuTransB =
+ (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ CUBLAS_CHECK(cublasSgemm(Caffeine::cublas_handle(), cuTransB, cuTransA,
+ N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+}
+
+template <>
+void decaf_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const double alpha, const double* A, const double* B, const double beta,
+ double* C) {
+ // Note that cublas follows fortran order.
+ int lda = (TransA == CblasNoTrans) ? K : M;
+ int ldb = (TransB == CblasNoTrans) ? N : K;
+ cublasOperation_t cuTransA =
+ (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t cuTransB =
+ (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+ CUBLAS_CHECK(cublasDgemm(Caffeine::cublas_handle(), cuTransA, cuTransB,
+ N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+}
+
+
+} // namespace caffeine
\ No newline at end of file
--- /dev/null
+#ifndef CAFFEINE_UTIL_GEMM_H_
+#define CAFFEINE_UTIL_GEMM_H_
+
+#include <mkl.h>
+#include <cublas_v2.h>
+
+namespace caffeine {
+
+// Decaf gemm provides a simpler interface to the gemm functions, with the
+// limitation that the data has to be contiguous in memory.
+template <typename Dtype>
+inline void decaf_cpu_gemm(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+ Dtype* C);
+
+// Decaf gpu gemm provides an interface that is almost the same as the cpu
+// gemm function - following the c convention and calling the fortran-order
+// gpu code under the hood.
+template <typename Dtype>
+void decaf_gpu_gemm(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+ const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+ Dtype* C);
+
+} // namespace caffeine
+
+#endif // CAFFEINE_UTIL_GEMM_H_