unsigned int size = M * N;
unsigned int size8 = (size >> 3) << 3;
unsigned int size4 = (size >> 2) << 2;
- if (beta != 0.F) {
+ if (std::fpclassify(beta) != FP_ZERO) {
for (; idx < size8; idx += 8) {
float16x8_t c =
vmulq_n_f16(vld1q_f16(&C[idx]), static_cast<__fp16>(beta));
*
*/
+#include <cmath>
#include <hgemm.h>
#include <hgemm_kernel_1x4.h>
#include <hgemm_kernel_1x8.h>
#include <hgemm_kernel_8x8.h>
#include <hgemm_kernel_pack.h>
#include <hgemm_util.h>
+#include <limits>
// Upper-case aliases for the micro-kernel symbols: each macro expands to the
// lower-case identifier of the same name.
// NOTE(review): the targets are presumably the kernel functions declared in
// the hgemm_kernel_*.h headers included by this file -- confirm.
#define HGEMM_KERNEL_1x4 hgemm_kernel_1x4
#define HGEMM_KERNEL_4x4 hgemm_kernel_4x4
void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
- if (alpha == 1.F) {
+ const float eps = std::numeric_limits<float>::epsilon();
+ if (std::abs(alpha - 1.F) < eps) {
// Use bitwise AND instead of modulo for performance,
// e.g., (M % 8) is the same as (M & 0x7), which extracts the last 3 bits of M.
if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {