hgemm_noTrans_8x8(M, N, K, A, K, B, N, C, N, alpha, beta);
} else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) {
hgemm_noTrans_4x8(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (N % 8 == 0) {
- hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta);
} else if (M % 4 == 0 && N % 4 == 0 && K % 4 == 0) {
hgemm_noTrans_4x4(M, N, K, A, K, B, N, C, N, alpha, beta);
+ } else if (N % 8 == 0) {
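+ // fallback when only N is vector-aligned; moved below the 4x4 branch so the
+ // blocked kernels are presumably preferred whenever M and K also align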
+ hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta);
} else if (N % 4 == 0) {
hgemm_noTrans_1x4(M, N, K, A, K, B, N, C, N, alpha, beta);
}
EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
}
+TEST(nntrainer_Tensor, dot_gemm_512_520_1032) {
+ /// @note GEMM : A X B = C
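+ /// 512, 520, and 1032 are all multiples of 8, so this case presumably
+ /// exercises the widest (8x8) no-transpose kernel path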
+ int batch = 1;
+ int channel = 1;
+ int height = 512;
+ int width = 520;
+
+ int height_b = 520;
+ int width_b = 1032;
+
+ bool transA = false;
+ bool transB = false;
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+ nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16);
+ nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+ nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32);
+ nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32);
+
+ const float alpha = 1e-1;
+ const int MOD = 10;
+
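+ // GEN_TEST_INPUT fills each tensor element-wise over (i, j, k, l); the
+ // pattern below cycles through {0.0, 0.1, ..., 0.9}, keeping fp16
+ // accumulation well-scaled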
+ GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) +
+ k * (width) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) +
+ j * (batch * height_b) + k * (width_b) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) +
+ j * (batch * height) + k * (width) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) +
+ j * (batch * height_b) + k * (width_b) + l + 1) %
+ MOD) *
+ alpha);
+
+ nntrainer::Tensor C = A.dot(B, transA, transB);
+
+ nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
+
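+ // compare the fp16 (NEON) result against the fp32 reference, both by MSE
+ // and by cosine similarity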
+ float mseErrorNeon =
+ mse<__fp16>(C.getData<__fp16>(), C_fp32.getData<float>(), C.size());
+
+ double cosSimNeon = cosine_similarity<__fp16>(
+ C.getData<__fp16>(), C_fp32.getData<float>(), C.size());
+
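+ // the tolerance scales with width (== K, the reduction length), since fp16
+ // rounding error accumulates over the K multiply-adds per output element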
+ const float epsilon = 1e-3 * width;
+
+ EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+ EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
+TEST(nntrainer_Tensor, dot_gemm_1001_1024_20000) {
+ /// @note GEMM : A X B = C
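+ /// here M = 1001 is not a multiple of 4 or 8, so (with N % 8 == 0) this
+ /// presumably lands on the 1x8 row-kernel fallback rather than a blocked path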
+ int batch = 1;
+ int channel = 1;
+ int height = 1001;
+ int width = 1024;
+
+ int height_b = 1024;
+ int width_b = 20000;
+
+ bool transA = false;
+ bool transB = false;
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+ nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+ nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16);
+ nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+ nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32);
+ nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32);
+
+ const float alpha = 1e-1;
+ const int MOD = 10;
+
+ GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) +
+ k * (width) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) +
+ j * (batch * height_b) + k * (width_b) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) +
+ j * (batch * height) + k * (width) + l + 1) %
+ MOD) *
+ alpha);
+ GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) +
+ j * (batch * height_b) + k * (width_b) + l + 1) %
+ MOD) *
+ alpha);
+
+ nntrainer::Tensor C = A.dot(B, transA, transB);
+
+ nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB);
+
+ float mseErrorNeon =
+ mse<__fp16>(C.getData<__fp16>(), C_fp32.getData<float>(), C.size());
+
+ double cosSimNeon = cosine_similarity<__fp16>(
+ C.getData<__fp16>(), C_fp32.getData<float>(), C.size());
+
+ const float epsilon = 1e-3 * width;
+
+ EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+ EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
TEST(nntrainer_Tensor, dot_gemm_50_768_516) {
/// @note GEMM : A X B = C
int batch = 1;