(float*)(top_data) + (N_ * i), 1);
}
}
+ break;
case sizeof(double):
// matrix multiply
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M_, N_, K_,
(double*)(top_data) + (N_ * i), 1);
}
}
+ break;
default:
CHECK(false) << "Unknown data type.";
}
CUBLAS_CHECK(cublasSgemm(Caffeine::cublas_handle(), CUBLAS_OP_N,
CUBLAS_OP_N, N_, M_, K_, (float*)&alpha, (const float*)weight, N_,
(const float*)bottom_data, K_, (float*)&beta, (float*)top_data, N_));
+ break;
case sizeof(double):
// matrix multiply
CUBLAS_CHECK(cublasDgemm(Caffeine::cublas_handle(), CUBLAS_OP_N,
CUBLAS_OP_N, N_, M_, K_, (double*)&alpha, (const double*)weight, N_,
(const double*)bottom_data, K_, (double*)&beta, (double*)top_data, N_));
+ break;
default:
CHECK(false) << "Unknown data type.";
}
using namespace std;
// File-scope copy of the CUDA device properties; main() below fills it in with
// cudaGetDeviceProperties() before the tests are run.
+cudaDeviceProp CAFFEINE_TEST_CUDA_PROP;
+
// Test-runner entry point.
//
// Initializes GoogleTest and Google logging from the command line, prints the
// current CUDA device id and a summary of that device's properties, then runs
// every registered test and returns the result of RUN_ALL_TESTS().
//
// NOTE(review): this block reads as a diff hunk -- lines prefixed with '-'
// use a local `cudaDeviceProp prop`, lines prefixed with '+' use the
// file-scope CAFFEINE_TEST_CUDA_PROP instead. Some context lines appear to be
// elided (see the note on `device` below).
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
::google::InitGoogleLogging(argv[0]);
// NOTE(review): `device` is not declared in the visible lines; presumably it
// is declared and filled with the device count in elided context -- confirm.
cout << "Cuda number of devices: " << device << endl;
// NOTE(review): the return codes of cudaGetDevice / cudaGetDeviceProperties
// are not checked here.
cudaGetDevice(&device);
cout << "Current device id: " << device << endl;
- cudaDeviceProp prop;
- cudaGetDeviceProperties(&prop, device);
- printf("Major revision number: %d\n", prop.major);
- printf("Minor revision number: %d\n", prop.minor);
- printf("Name: %s\n", prop.name);
- printf("Total global memory: %u\n", prop.totalGlobalMem);
- printf("Total shared memory per block: %u\n", prop.sharedMemPerBlock);
- printf("Total registers per block: %d\n", prop.regsPerBlock);
- printf("Warp size: %d\n", prop.warpSize);
- printf("Maximum memory pitch: %u\n", prop.memPitch);
- printf("Maximum threads per block: %d\n", prop.maxThreadsPerBlock);
// Query the properties of the current device into the file-scope
// CAFFEINE_TEST_CUDA_PROP and print them.
+ cudaGetDeviceProperties(&CAFFEINE_TEST_CUDA_PROP, device);
+ printf("Major revision number: %d\n", CAFFEINE_TEST_CUDA_PROP.major);
+ printf("Minor revision number: %d\n", CAFFEINE_TEST_CUDA_PROP.minor);
+ printf("Name: %s\n", CAFFEINE_TEST_CUDA_PROP.name);
// NOTE(review): totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem and
// textureAlignment are presumably size_t in cudaDeviceProp; "%u" would then
// mismatch on 64-bit builds and "%zu" would be the matching specifier --
// confirm against the CUDA runtime headers.
+ printf("Total global memory: %u\n", CAFFEINE_TEST_CUDA_PROP.totalGlobalMem);
+ printf("Total shared memory per block: %u\n", CAFFEINE_TEST_CUDA_PROP.sharedMemPerBlock);
+ printf("Total registers per block: %d\n", CAFFEINE_TEST_CUDA_PROP.regsPerBlock);
+ printf("Warp size: %d\n", CAFFEINE_TEST_CUDA_PROP.warpSize);
+ printf("Maximum memory pitch: %u\n", CAFFEINE_TEST_CUDA_PROP.memPitch);
+ printf("Maximum threads per block: %d\n", CAFFEINE_TEST_CUDA_PROP.maxThreadsPerBlock);
// One line per block dimension (indices 0..2).
for (int i = 0; i < 3; ++i)
- printf("Maximum dimension %d of block: %d\n", i, prop.maxThreadsDim[i]);
+ printf("Maximum dimension %d of block: %d\n", i, CAFFEINE_TEST_CUDA_PROP.maxThreadsDim[i]);
// One line per grid dimension (indices 0..2).
for (int i = 0; i < 3; ++i)
- printf("Maximum dimension %d of grid: %d\n", i, prop.maxGridSize[i]);
- printf("Clock rate: %d\n", prop.clockRate);
- printf("Total constant memory: %u\n", prop.totalConstMem);
- printf("Texture alignment: %u\n", prop.textureAlignment);
- printf("Concurrent copy and execution: %s\n", (prop.deviceOverlap ? "Yes" : "No"));
- printf("Number of multiprocessors: %d\n", prop.multiProcessorCount);
- printf("Kernel execution timeout: %s\n", (prop.kernelExecTimeoutEnabled ? "Yes" : "No"));
+ printf("Maximum dimension %d of grid: %d\n", i, CAFFEINE_TEST_CUDA_PROP.maxGridSize[i]);
+ printf("Clock rate: %d\n", CAFFEINE_TEST_CUDA_PROP.clockRate);
+ printf("Total constant memory: %u\n", CAFFEINE_TEST_CUDA_PROP.totalConstMem);
+ printf("Texture alignment: %u\n", CAFFEINE_TEST_CUDA_PROP.textureAlignment);
+ printf("Concurrent copy and execution: %s\n", (CAFFEINE_TEST_CUDA_PROP.deviceOverlap ? "Yes" : "No"));
+ printf("Number of multiprocessors: %d\n", CAFFEINE_TEST_CUDA_PROP.multiProcessorCount);
+ printf("Kernel execution timeout: %s\n", (CAFFEINE_TEST_CUDA_PROP.kernelExecTimeoutEnabled ? "Yes" : "No"));
// Run all registered GoogleTest cases; their result becomes the exit code.
return RUN_ALL_TESTS();
}