* @date 05 April 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Debadri Samaddar <s.debadri@samsung.com>
+ * @author Sungsik Kong <ss.kong@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is half-precision GEMM 1x8 kernel
*
#include <stdlib.h>
// 1. Partial sum 64 digits : worst accuracy, best latency
-#define KERNEL_1x8_ACC8() \
- v0 = vdupq_n_f16(0.F); \
- dv0 = *a; \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_n_f16(v0, v24, dv0); \
- dv1 = *(a + 1); \
- v25 = vld1q_f16(b + 8); \
- v0 = vfmaq_n_f16(v0, v25, dv1); \
- dv2 = *(a + 2); \
- v26 = vld1q_f16(b + 16); \
- v0 = vfmaq_n_f16(v0, v26, dv2); \
- dv3 = *(a + 3); \
- v27 = vld1q_f16(b + 24); \
- v0 = vfmaq_n_f16(v0, v27, dv3); \
- dv4 = *(a + 4); \
- v28 = vld1q_f16(b + 32); \
- v0 = vfmaq_n_f16(v0, v28, dv4); \
- dv5 = *(a + 5); \
- v29 = vld1q_f16(b + 40); \
- v0 = vfmaq_n_f16(v0, v29, dv5); \
- dv6 = *(a + 6); \
- v30 = vld1q_f16(b + 48); \
- v0 = vfmaq_n_f16(v0, v30, dv6); \
- dv7 = *(a + 7); \
- v31 = vld1q_f16(b + 56); \
- v0 = vfmaq_n_f16(v0, v31, dv7); \
- l += 8; \
- b += 8 * 8; \
- a += 8;
+#define KERNEL_1x8_ACC8() \
+ do { \
+ v0 = vdupq_n_f16(0.F); \
+ dv0 = *a; \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_n_f16(v0, v24, dv0); \
+ dv1 = *(a + 1); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_n_f16(v0, v25, dv1); \
+ dv2 = *(a + 2); \
+ v26 = vld1q_f16(b + 16); \
+ v0 = vfmaq_n_f16(v0, v26, dv2); \
+ dv3 = *(a + 3); \
+ v27 = vld1q_f16(b + 24); \
+ v0 = vfmaq_n_f16(v0, v27, dv3); \
+ dv4 = *(a + 4); \
+ v28 = vld1q_f16(b + 32); \
+ v0 = vfmaq_n_f16(v0, v28, dv4); \
+ dv5 = *(a + 5); \
+ v29 = vld1q_f16(b + 40); \
+ v0 = vfmaq_n_f16(v0, v29, dv5); \
+ dv6 = *(a + 6); \
+ v30 = vld1q_f16(b + 48); \
+ v0 = vfmaq_n_f16(v0, v30, dv6); \
+ dv7 = *(a + 7); \
+ v31 = vld1q_f16(b + 56); \
+ v0 = vfmaq_n_f16(v0, v31, dv7); \
+ l += 8; \
+ b += 8 * 8; \
+ a += 8; \
+ } while (0)
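The change in this hunk is purely structural: the multi-statement macro body is wrapped in do { ... } while (0) so that an invocation such as KERNEL_1x8_ACC8(); expands to exactly one statement. A minimal standalone sketch of why that matters; step1/step2 and the two macro names below are invented for illustration and are not part of nntrainer:

#include <cstdio>

static void step1() { std::puts("step1"); }
static void step2() { std::puts("step2"); }

// Without the wrapper the macro expands to two statements: only the first is
// governed by the enclosing `if`, and a trailing `else` would not compile.
#define RUN_BOTH_BAD() step1(); step2()

// With do { ... } while (0) the expansion is a single statement, so the macro
// can be used (with a trailing semicolon) anywhere a statement is expected.
#define RUN_BOTH_GOOD() do { step1(); step2(); } while (0)

int main() {
  if (false)
    RUN_BOTH_BAD();  // step2() still runs: it sits outside the if-body
  if (false)
    RUN_BOTH_GOOD(); // neither helper runs, as intended
  return 0;
}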
// 2. Partial sum 32 digits : medium accuracy, medium latency
-#define KERNEL_1x8_ACC4() \
- v0 = vdupq_n_f16(0.F); \
- dv0 = *a; \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_n_f16(v0, v24, dv0); \
- dv1 = *(a + 1); \
- v25 = vld1q_f16(b + 8); \
- v0 = vfmaq_n_f16(v0, v25, dv1); \
- dv2 = *(a + 2); \
- v26 = vld1q_f16(b + 16); \
- v0 = vfmaq_n_f16(v0, v26, dv2); \
- dv3 = *(a + 3); \
- v27 = vld1q_f16(b + 24); \
- v0 = vfmaq_n_f16(v0, v27, dv3); \
- l += 4; \
- b += 8 * 4; \
- a += 4;
+#define KERNEL_1x8_ACC4() \
+ do { \
+ v0 = vdupq_n_f16(0.F); \
+ dv0 = *a; \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_n_f16(v0, v24, dv0); \
+ dv1 = *(a + 1); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_n_f16(v0, v25, dv1); \
+ dv2 = *(a + 2); \
+ v26 = vld1q_f16(b + 16); \
+ v0 = vfmaq_n_f16(v0, v26, dv2); \
+ dv3 = *(a + 3); \
+ v27 = vld1q_f16(b + 24); \
+ v0 = vfmaq_n_f16(v0, v27, dv3); \
+ l += 4; \
+ b += 8 * 4; \
+ a += 4; \
+ } while (0)
// 3. Partial sum 8 digits : best accuracy, worst latency
-#define KERNEL_1x8_ACC1() \
- v0 = vdupq_n_f16(0.F); \
- dv0 = *(a); \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_n_f16(v0, v24, dv0); \
- l += 1; \
- b += 8 * 1; \
- a++;
+#define KERNEL_1x8_ACC1() \
+ do { \
+ v0 = vdupq_n_f16(0.F); \
+ dv0 = *(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_n_f16(v0, v24, dv0); \
+ l += 1; \
+ b += 8 * 1; \
+ a++; \
+ } while (0)
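For reference, a portable scalar model of what one pass over this 1x8 micro-kernel computes, and how the ACC8/ACC4/ACC1 variants trade accuracy for latency: each macro zeroes v0, folds a fixed number of k-steps into that fp16 partial sum, and the caller then flushes it into a wider running total. The packing layout, the flush step, and the assumption that K is a multiple of acc_steps are illustrative; the corresponding save macro is not shown in this excerpt.

#include <cstddef>

// a: one packed row of A (K values); b: packed B panel laid out as K rows of
// 8 values; c: the 8-wide output row. float is used here for portability; in
// the real kernel the partial sum lives in an fp16 register, which is why
// flushing every step (ACC1) is the most accurate and flushing every 8 steps
// (ACC8) is the fastest but least accurate.
void gemm_1x8_reference(const float *a, const float *b, float *c, size_t K,
                        size_t acc_steps) {
  for (size_t l = 0; l < K; l += acc_steps) {
    float partial[8] = {0.f}; // v0 = vdupq_n_f16(0.F)
    for (size_t k = 0; k < acc_steps; ++k)
      for (size_t j = 0; j < 8; ++j)
        partial[j] += b[(l + k) * 8 + j] * a[l + k]; // vfmaq_n_f16
    for (size_t j = 0; j < 8; ++j)
      c[j] += partial[j]; // flush into the wider accumulator
  }
}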
/**
* @brief hgemm 1x8 kernel sc = sa * sb
#include <hgemm_common.h>
#include <stdlib.h>
-#define INIT_KERNEL_4x4() \
- v24 = vdup_n_f16(0.F); \
- v25 = vdup_n_f16(0.F); \
- v26 = vdup_n_f16(0.F); \
- v27 = vdup_n_f16(0.F);
+#define INIT_KERNEL_4x4() \
+ do { \
+ v24 = vdup_n_f16(0.F); \
+ v25 = vdup_n_f16(0.F); \
+ v26 = vdup_n_f16(0.F); \
+ v27 = vdup_n_f16(0.F); \
+ } while (0)
// 1. Partial sum 256 digits
-#define KERNEL_4x4_ACC16() \
- dv0 = vld1_f16(a); \
- vb0 = vld1_f16(b); \
- v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
- v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
- v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
- v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
- dv1 = vld1_f16(a + 4); \
- vb1 = vld1_f16(b + 4); \
- v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
- v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
- v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
- v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
- dv2 = vld1_f16(a + 4 * 2); \
- vb2 = vld1_f16(b + 4 * 2); \
- v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
- v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
- v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
- v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
- dv3 = vld1_f16(a + 4 * 3); \
- vb3 = vld1_f16(b + 4 * 3); \
- v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
- v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
- v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
- v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
- dv4 = vld1_f16(a + 4 * 4); \
- vb4 = vld1_f16(b + 4 * 4); \
- v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
- v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
- v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
- v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
- dv5 = vld1_f16(a + 4 * 5); \
- vb5 = vld1_f16(b + 4 * 5); \
- v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
- v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
- v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
- v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
- dv6 = vld1_f16(a + 4 * 6); \
- vb6 = vld1_f16(b + 4 * 6); \
- v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
- v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
- v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
- v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
- dv7 = vld1_f16(a + 4 * 7); \
- vb7 = vld1_f16(b + 4 * 7); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 8); \
- vb7 = vld1_f16(b + 4 * 8); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 9); \
- vb7 = vld1_f16(b + 4 * 9); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 10); \
- vb7 = vld1_f16(b + 4 * 10); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 11); \
- vb7 = vld1_f16(b + 4 * 11); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 12); \
- vb7 = vld1_f16(b + 4 * 12); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 13); \
- vb7 = vld1_f16(b + 4 * 13); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 14); \
- vb7 = vld1_f16(b + 4 * 14); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 15); \
- vb7 = vld1_f16(b + 4 * 15); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- l += 16; \
- __builtin_prefetch(b + 64, 0, 3); \
- __builtin_prefetch(a + 64, 0, 3); \
- b += 4 * 16; \
- a += 4 * 16;
+#define KERNEL_4x4_ACC16() \
+ do { \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ vb1 = vld1_f16(b + 4); \
+ v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
+ v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
+ v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
+ v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
+ dv2 = vld1_f16(a + 4 * 2); \
+ vb2 = vld1_f16(b + 4 * 2); \
+ v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
+ v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
+ v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
+ v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
+ dv3 = vld1_f16(a + 4 * 3); \
+ vb3 = vld1_f16(b + 4 * 3); \
+ v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
+ v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
+ v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
+ v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
+ dv4 = vld1_f16(a + 4 * 4); \
+ vb4 = vld1_f16(b + 4 * 4); \
+ v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
+ v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
+ v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
+ v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
+ dv5 = vld1_f16(a + 4 * 5); \
+ vb5 = vld1_f16(b + 4 * 5); \
+ v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
+ v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
+ v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
+ v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
+ dv6 = vld1_f16(a + 4 * 6); \
+ vb6 = vld1_f16(b + 4 * 6); \
+ v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
+ v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
+ v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
+ v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
+ dv7 = vld1_f16(a + 4 * 7); \
+ vb7 = vld1_f16(b + 4 * 7); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 8); \
+ vb7 = vld1_f16(b + 4 * 8); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 9); \
+ vb7 = vld1_f16(b + 4 * 9); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 10); \
+ vb7 = vld1_f16(b + 4 * 10); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 11); \
+ vb7 = vld1_f16(b + 4 * 11); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 12); \
+ vb7 = vld1_f16(b + 4 * 12); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 13); \
+ vb7 = vld1_f16(b + 4 * 13); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 14); \
+ vb7 = vld1_f16(b + 4 * 14); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 15); \
+ vb7 = vld1_f16(b + 4 * 15); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ l += 16; \
+ __builtin_prefetch(b + 64, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ b += 4 * 16; \
+ a += 4 * 16; \
+ } while (0)
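The __builtin_prefetch(addr, rw, locality) calls above use the GCC/Clang builtin: the second argument (0) marks the access as a read and the third (3) asks for maximum temporal locality, i.e. keep the line in as many cache levels as possible. A tiny standalone sketch of the same pattern; the array, stride, and prefetch distance are made up for illustration:

#include <cstddef>

float sum_with_prefetch(const float *x, size_t n) {
  float s = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    // Hint that x[i + 64] will be read soon (read access, locality 3);
    // prefetching is only a hint, so an address past the end is harmless.
    __builtin_prefetch(x + i + 64, 0, 3);
    s += x[i];
  }
  return s;
}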
// 2. Partial sum 128 digits
-#define KERNEL_4x4_ACC8() \
- dv0 = vld1_f16(a); \
- vb0 = vld1_f16(b); \
- v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
- v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
- v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
- v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
- dv1 = vld1_f16(a + 4); \
- vb1 = vld1_f16(b + 4); \
- v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
- v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
- v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
- v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
- dv2 = vld1_f16(a + 8); \
- vb2 = vld1_f16(b + 8); \
- v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
- v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
- v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
- v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
- dv3 = vld1_f16(a + 12); \
- vb3 = vld1_f16(b + 12); \
- v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
- v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
- v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
- v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
- dv4 = vld1_f16(a + 16); \
- vb4 = vld1_f16(b + 16); \
- v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
- v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
- v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
- v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
- dv5 = vld1_f16(a + 20); \
- vb5 = vld1_f16(b + 20); \
- v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
- v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
- v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
- v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
- dv6 = vld1_f16(a + 24); \
- vb6 = vld1_f16(b + 24); \
- v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
- v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
- v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
- v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
- dv7 = vld1_f16(a + 28); \
- vb7 = vld1_f16(b + 28); \
- v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
- v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
- v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
- v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
- l += 8; \
- __builtin_prefetch(b + 32, 0, 3); \
- __builtin_prefetch(a + 32, 0, 3); \
- b += 4 * 8; \
- a += 4 * 8;
+#define KERNEL_4x4_ACC8() \
+ do { \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ vb1 = vld1_f16(b + 4); \
+ v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
+ v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
+ v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
+ v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
+ dv2 = vld1_f16(a + 8); \
+ vb2 = vld1_f16(b + 8); \
+ v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
+ v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
+ v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
+ v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
+ dv3 = vld1_f16(a + 12); \
+ vb3 = vld1_f16(b + 12); \
+ v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
+ v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
+ v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
+ v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
+ dv4 = vld1_f16(a + 16); \
+ vb4 = vld1_f16(b + 16); \
+ v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
+ v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
+ v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
+ v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
+ dv5 = vld1_f16(a + 20); \
+ vb5 = vld1_f16(b + 20); \
+ v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
+ v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
+ v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
+ v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
+ dv6 = vld1_f16(a + 24); \
+ vb6 = vld1_f16(b + 24); \
+ v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
+ v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
+ v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
+ v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
+ dv7 = vld1_f16(a + 28); \
+ vb7 = vld1_f16(b + 28); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ l += 8; \
+ __builtin_prefetch(b + 32, 0, 3); \
+ __builtin_prefetch(a + 32, 0, 3); \
+ b += 4 * 8; \
+ a += 4 * 8; \
+ } while (0)
-// 2. Partial sum 16 digits
-#define KERNEL_4x4_ACC1() \
- dv0 = vld1_f16(a); \
- vb0 = vld1_f16(b); \
- v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
- v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
- v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
- v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
- l += 1; \
- __builtin_prefetch(b + 4, 0, 3); \
- __builtin_prefetch(a + 4, 0, 3); \
- b += 4 * 1; \
- a += 4 * 1;
+// 3. Partial sum 16 digits
+#define KERNEL_4x4_ACC1() \
+ do { \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ l += 1; \
+ __builtin_prefetch(b + 4, 0, 3); \
+ __builtin_prefetch(a + 4, 0, 3); \
+ b += 4 * 1; \
+ a += 4 * 1; \
+ } while (0)
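Each k-step in the ACC macros above is a rank-1 update of a 4x4 tile: vfma_lane_f16 with lane i adds B[k][j] * A[i][k] into accumulator row i (v24..v27). A plain-float sketch of one such step; the array names are illustrative:

// dv holds A[0..3][k] (one column of the packed A block), vb holds
// B[k][0..3] (one row of the packed B block), and acc[i] plays the role of
// v24 + i.
void tile4x4_rank1_update(float acc[4][4], const float dv[4],
                          const float vb[4]) {
  for (int i = 0; i < 4; ++i)   // lane index passed to vfma_lane_f16
    for (int j = 0; j < 4; ++j) // vector lane of vb
      acc[i][j] += vb[j] * dv[i];
}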
-#define SAVE_KERNEL_4X4_F16_F32() \
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(v24))); \
- vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(v25))); \
- vst1q_f32(c + 2 * ldc, \
- vaddq_f32(vld1q_f32(c + 2 * ldc), vcvt_f32_f16(v26))); \
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27)));
+#define SAVE_KERNEL_4X4_F16_F32() \
+ do { \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(v24))); \
+ vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(v25))); \
+ vst1q_f32(c + 2 * ldc, \
+ vaddq_f32(vld1q_f32(c + 2 * ldc), vcvt_f32_f16(v26))); \
+ vst1q_f32(c + 3 * ldc, \
+ vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); \
+ } while (0)
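The save macro widens each fp16 accumulator row to fp32 with vcvt_f32_f16 and adds it into the output matrix, whose rows sit ldc floats apart. A scalar model for reference; acc is assumed to hold the already-widened values of v24..v27:

void save_4x4_tile(float *c, int ldc, const float acc[4][4]) {
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      c[i * ldc + j] += acc[i][j]; // row i of the tile starts at c + i * ldc
}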
/**
* @brief hgemm 4x4 kernel sc = sa * sb
#include <hgemm_common.h>
#include <stdlib.h>
-#define INIT_KERNEL_4X8() \
- v0 = vdupq_n_f16(0.F); \
- v3 = vdupq_n_f16(0.F); \
- v6 = vdupq_n_f16(0.F); \
- v9 = vdupq_n_f16(0.F);
+#define INIT_KERNEL_4X8() \
+ do { \
+ v0 = vdupq_n_f16(0.F); \
+ v3 = vdupq_n_f16(0.F); \
+ v6 = vdupq_n_f16(0.F); \
+ v9 = vdupq_n_f16(0.F); \
+ } while (0)
// 1. Partial sum 512 digits
-#define KERNEL_4x8_ACC16() \
- dv0 = vld1_f16(a); \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
- v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
- v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
- v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
- dv1 = vld1_f16(a + 4); \
- v25 = vld1q_f16(b + 8); \
- v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
- v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
- v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
- v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
- dv2 = vld1_f16(a + 4 * 2); \
- v26 = vld1q_f16(b + 8 * 2); \
- v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
- v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
- v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
- v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
- dv3 = vld1_f16(a + 4 * 3); \
- v27 = vld1q_f16(b + 8 * 3); \
- v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
- v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
- v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
- v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
- dv4 = vld1_f16(a + 4 * 4); \
- v28 = vld1q_f16(b + 8 * 4); \
- v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \
- v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \
- v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \
- v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \
- dv5 = vld1_f16(a + 4 * 5); \
- v29 = vld1q_f16(b + 8 * 5); \
- v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \
- v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \
- v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \
- v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \
- dv6 = vld1_f16(a + 4 * 6); \
- v30 = vld1q_f16(b + 8 * 6); \
- v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \
- v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \
- v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \
- v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \
- dv7 = vld1_f16(a + 4 * 7); \
- v31 = vld1q_f16(b + 8 * 7); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 8); \
- v31 = vld1q_f16(b + 8 * 8); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 9); \
- v31 = vld1q_f16(b + 8 * 9); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 10); \
- v31 = vld1q_f16(b + 8 * 10); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 11); \
- v31 = vld1q_f16(b + 8 * 11); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 12); \
- v31 = vld1q_f16(b + 8 * 12); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 13); \
- v31 = vld1q_f16(b + 8 * 13); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 14); \
- v31 = vld1q_f16(b + 8 * 14); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- dv7 = vld1_f16(a + 4 * 15); \
- v31 = vld1q_f16(b + 8 * 15); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- l += 16; \
- __builtin_prefetch(b + 128, 0, 3); \
- __builtin_prefetch(a + 64, 0, 3); \
- b += 8 * 16; \
- a += 4 * 16;
+#define KERNEL_4x8_ACC16() \
+ do { \
+ dv0 = vld1_f16(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
+ v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
+ v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
+ v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
+ v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
+ v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
+ v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
+ dv2 = vld1_f16(a + 4 * 2); \
+ v26 = vld1q_f16(b + 8 * 2); \
+ v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
+ v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
+ v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
+ v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
+ dv3 = vld1_f16(a + 4 * 3); \
+ v27 = vld1q_f16(b + 8 * 3); \
+ v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
+ v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
+ v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
+ v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
+ dv4 = vld1_f16(a + 4 * 4); \
+ v28 = vld1q_f16(b + 8 * 4); \
+ v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \
+ v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \
+ v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \
+ v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \
+ dv5 = vld1_f16(a + 4 * 5); \
+ v29 = vld1q_f16(b + 8 * 5); \
+ v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \
+ v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \
+ v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \
+ v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \
+ dv6 = vld1_f16(a + 4 * 6); \
+ v30 = vld1q_f16(b + 8 * 6); \
+ v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \
+ v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \
+ v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \
+ v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \
+ dv7 = vld1_f16(a + 4 * 7); \
+ v31 = vld1q_f16(b + 8 * 7); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 8); \
+ v31 = vld1q_f16(b + 8 * 8); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 9); \
+ v31 = vld1q_f16(b + 8 * 9); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 10); \
+ v31 = vld1q_f16(b + 8 * 10); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 11); \
+ v31 = vld1q_f16(b + 8 * 11); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 12); \
+ v31 = vld1q_f16(b + 8 * 12); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 13); \
+ v31 = vld1q_f16(b + 8 * 13); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 14); \
+ v31 = vld1q_f16(b + 8 * 14); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 15); \
+ v31 = vld1q_f16(b + 8 * 15); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ l += 16; \
+ __builtin_prefetch(b + 128, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ b += 8 * 16; \
+ a += 4 * 16; \
+ } while (0)
// 2. Partial sum 256 digits
-#define KERNEL_4x8_ACC8() \
- dv0 = vld1_f16(a); \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
- v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
- v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
- v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
- dv1 = vld1_f16(a + 4); \
- v25 = vld1q_f16(b + 8); \
- v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
- v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
- v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
- v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
- dv2 = vld1_f16(a + 8); \
- v26 = vld1q_f16(b + 16); \
- v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
- v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
- v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
- v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
- dv3 = vld1_f16(a + 12); \
- v27 = vld1q_f16(b + 24); \
- v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
- v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
- v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
- v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
- dv4 = vld1_f16(a + 16); \
- v28 = vld1q_f16(b + 32); \
- v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \
- v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \
- v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \
- v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \
- dv5 = vld1_f16(a + 20); \
- v29 = vld1q_f16(b + 40); \
- v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \
- v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \
- v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \
- v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \
- dv6 = vld1_f16(a + 24); \
- v30 = vld1q_f16(b + 48); \
- v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \
- v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \
- v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \
- v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \
- dv7 = vld1_f16(a + 28); \
- v31 = vld1q_f16(b + 56); \
- v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
- v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
- v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
- v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
- l += 8; \
- __builtin_prefetch(b + 64, 0, 3); \
- __builtin_prefetch(a + 32, 0, 3); \
- b += 8 * 8; \
- a += 4 * 8;
+#define KERNEL_4x8_ACC8() \
+ do { \
+ dv0 = vld1_f16(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
+ v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
+ v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
+ v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
+ v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
+ v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
+ v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
+ dv2 = vld1_f16(a + 8); \
+ v26 = vld1q_f16(b + 16); \
+ v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
+ v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
+ v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
+ v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
+ dv3 = vld1_f16(a + 12); \
+ v27 = vld1q_f16(b + 24); \
+ v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
+ v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
+ v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
+ v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
+ dv4 = vld1_f16(a + 16); \
+ v28 = vld1q_f16(b + 32); \
+ v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \
+ v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \
+ v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \
+ v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \
+ dv5 = vld1_f16(a + 20); \
+ v29 = vld1q_f16(b + 40); \
+ v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \
+ v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \
+ v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \
+ v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \
+ dv6 = vld1_f16(a + 24); \
+ v30 = vld1q_f16(b + 48); \
+ v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \
+ v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \
+ v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \
+ v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \
+ dv7 = vld1_f16(a + 28); \
+ v31 = vld1q_f16(b + 56); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ l += 8; \
+ __builtin_prefetch(b + 64, 0, 3); \
+ __builtin_prefetch(a + 32, 0, 3); \
+ b += 8 * 8; \
+ a += 4 * 8; \
+ } while (0)
// 3. Partial sum 128 digits
-#define KERNEL_4x8_ACC4() \
- dv0 = vld1_f16(a); \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
- v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
- v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
- v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
- dv1 = vld1_f16(a + 4); \
- v25 = vld1q_f16(b + 8); \
- v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
- v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
- v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
- v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
- dv2 = vld1_f16(a + 8); \
- v26 = vld1q_f16(b + 16); \
- v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
- v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
- v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
- v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
- dv3 = vld1_f16(a + 12); \
- v27 = vld1q_f16(b + 24); \
- v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
- v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
- v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
- v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
- l += 4; \
- __builtin_prefetch(b + 32, 0, 3); \
- __builtin_prefetch(a + 16, 0, 3); \
- b += 8 * 4; \
- a += 4 * 4;
+#define KERNEL_4x8_ACC4() \
+ do { \
+ dv0 = vld1_f16(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
+ v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
+ v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
+ v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
+ v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
+ v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
+ v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
+ dv2 = vld1_f16(a + 8); \
+ v26 = vld1q_f16(b + 16); \
+ v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
+ v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
+ v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
+ v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
+ dv3 = vld1_f16(a + 12); \
+ v27 = vld1q_f16(b + 24); \
+ v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
+ v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
+ v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
+ v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
+ l += 4; \
+ __builtin_prefetch(b + 32, 0, 3); \
+ __builtin_prefetch(a + 16, 0, 3); \
+ b += 8 * 4; \
+ a += 4 * 4; \
+ } while (0)
// 4. Partial sum 32 digits
-#define KERNEL_4x8_ACC1() \
- dv0 = vld1_f16(a); \
- v24 = vld1q_f16(b); \
- v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
- v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
- v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
- v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
- l += 1; \
- __builtin_prefetch(b + 8, 0, 3); \
- __builtin_prefetch(a + 4, 0, 3); \
- b += 8 * 1; \
- a += 4 * 1;
+#define KERNEL_4x8_ACC1() \
+ do { \
+ dv0 = vld1_f16(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
+ v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
+ v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
+ v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
+ l += 1; \
+ __builtin_prefetch(b + 8, 0, 3); \
+ __builtin_prefetch(a + 4, 0, 3); \
+ b += 8 * 1; \
+ a += 4 * 1; \
+ } while (0)
-#define SAVE_KERNEL_4X8_F16_F32() \
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); \
- vst1q_f32(c + ldc, \
- vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v3)))); \
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
- vcvt_f32_f16(vget_low_f16(v6)))); \
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
- vcvt_f32_f16(vget_low_f16(v9)))); \
- \
- vst1q_f32(c + 4, \
- vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); \
- vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
- vcvt_f32_f16(vget_high_f16(v3)))); \
- vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
- vcvt_f32_f16(vget_high_f16(v6)))); \
- vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
- vcvt_f32_f16(vget_high_f16(v9))));
+#define SAVE_KERNEL_4X8_F16_F32() \
+ do { \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); \
+ vst1q_f32(c + ldc, \
+ vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v3)))); \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v6)))); \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v9)))); \
+ \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); \
+ vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
+ vcvt_f32_f16(vget_high_f16(v3)))); \
+ vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v6)))); \
+ vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v9)))); \
+ } while (0)
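The ACC8/ACC4/ACC1 variants are presumably selected by the caller according to how many k-steps remain, with INIT and SAVE bracketing each chunk so the fp16 partial sums are flushed into fp32 regularly. A hedged sketch of such a driver for one 4x8 tile; tile_4x8 is a hypothetical function, not the actual nntrainer loop, and it needs the macros above plus an ARMv8.2-A FP16 target (e.g. -march=armv8.2-a+fp16):

#include <arm_neon.h>

void tile_4x8(const float16_t *a, const float16_t *b, float *c,
              unsigned int K, unsigned int ldc) {
  float16x8_t v0, v3, v6, v9;                          // 4x8 accumulator rows
  float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;  // loaded B rows
  float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7;  // loaded A columns
  for (unsigned int l = 0; l < K;) {
    INIT_KERNEL_4X8();         // zero the fp16 partial sums
    if (K - l >= 8)
      KERNEL_4x8_ACC8();       // largest chunk: fastest, least accurate
    else if (K - l >= 4)
      KERNEL_4x8_ACC4();
    else
      KERNEL_4x8_ACC1();
    SAVE_KERNEL_4X8_F16_F32(); // widen to fp32 and add into C
  }
}

Note that the if / else if chain above only works because each macro now expands to a single do { ... } while (0) statement.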
/**
* @brief hgemm 4x8 kernel sc = sa * sb
*/
#include <hgemm_common.h>
+#include <iostream>
#include <stdlib.h>
-#define INIT_KERNEL_8X16() \
- v0_7 = vdupq_n_f16(0.F); \
- v8_15 = vdupq_n_f16(0.F); \
- v16_23 = vdupq_n_f16(0.F); \
- v24_31 = vdupq_n_f16(0.F); \
- v32_39 = vdupq_n_f16(0.F); \
- v40_47 = vdupq_n_f16(0.F); \
- v48_55 = vdupq_n_f16(0.F); \
- v56_63 = vdupq_n_f16(0.F); \
- v64_71 = vdupq_n_f16(0.F); \
- v72_79 = vdupq_n_f16(0.F); \
- v80_87 = vdupq_n_f16(0.F); \
- v88_95 = vdupq_n_f16(0.F); \
- v96_103 = vdupq_n_f16(0.F); \
- v104_111 = vdupq_n_f16(0.F); \
- v112_119 = vdupq_n_f16(0.F); \
- v120_127 = vdupq_n_f16(0.F);
+#define INIT_KERNEL_8X16() \
+ do { \
+ v0_7 = vdupq_n_f16(0.F); \
+ v8_15 = vdupq_n_f16(0.F); \
+ v16_23 = vdupq_n_f16(0.F); \
+ v24_31 = vdupq_n_f16(0.F); \
+ v32_39 = vdupq_n_f16(0.F); \
+ v40_47 = vdupq_n_f16(0.F); \
+ v48_55 = vdupq_n_f16(0.F); \
+ v56_63 = vdupq_n_f16(0.F); \
+ v64_71 = vdupq_n_f16(0.F); \
+ v72_79 = vdupq_n_f16(0.F); \
+ v80_87 = vdupq_n_f16(0.F); \
+ v88_95 = vdupq_n_f16(0.F); \
+ v96_103 = vdupq_n_f16(0.F); \
+ v104_111 = vdupq_n_f16(0.F); \
+ v112_119 = vdupq_n_f16(0.F); \
+ v120_127 = vdupq_n_f16(0.F); \
+ } while (0)
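INIT_KERNEL_8X16 zeroes sixteen float16x8_t accumulators that together hold one 8x16 output tile: judging by the ACC macros below, v0_7 through v56_63 cover columns 0-7 of rows 0-7 (one register per row) and v64_71 through v120_127 cover columns 8-15 of the same rows. A plain-float sketch of the rank-1 update performed for each k-step; the array names are illustrative:

// va holds A[0..7][k], vb holds B[k][0..15]; acc[i][0..7] corresponds to the
// registers fed from the first B vector and acc[i][8..15] to those fed from
// the second.
void tile8x16_rank1_update(float acc[8][16], const float va[8],
                           const float vb[16]) {
  for (int i = 0; i < 8; ++i)    // lane index given to vfmaq_laneq_f16
    for (int j = 0; j < 16; ++j) // column within the 16-wide B panel
      acc[i][j] += vb[j] * va[i];
}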
// 1. Partial sum 2048 digits
-#define KERNEL_8x16_ACC16() \
- va0 = vld1q_f16(a); \
- v24 = vld1q_f16(b); \
- v25 = vld1q_f16(b + 8); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v26 = vld1q_f16(b + 8 * 2); \
- v27 = vld1q_f16(b + 8 * 3); \
- v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \
- va2 = vld1q_f16(a + 8 * 2); \
- v28 = vld1q_f16(b + 8 * 4); \
- v29 = vld1q_f16(b + 8 * 5); \
- v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \
- va3 = vld1q_f16(a + 8 * 3); \
- v30 = vld1q_f16(b + 8 * 6); \
- v31 = vld1q_f16(b + 8 * 7); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \
- va4 = vld1q_f16(a + 8 * 4); \
- v24 = vld1q_f16(b + 8 * 8); \
- v25 = vld1q_f16(b + 8 * 9); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va4, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va4, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va4, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va4, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va4, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va4, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va4, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va4, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va4, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va4, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va4, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va4, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va4, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va4, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va4, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va4, 7); \
- va5 = vld1q_f16(a + 8 * 5); \
- v26 = vld1q_f16(b + 8 * 10); \
- v27 = vld1q_f16(b + 8 * 11); \
- v0_7 = vfmaq_laneq_f16(v0_7, v26, va5, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v26, va5, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v26, va5, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v26, va5, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v26, va5, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v26, va5, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v26, va5, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v26, va5, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v27, va5, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v27, va5, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v27, va5, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v27, va5, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v27, va5, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v27, va5, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v27, va5, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v27, va5, 7); \
- va6 = vld1q_f16(a + 8 * 6); \
- v28 = vld1q_f16(b + 8 * 12); \
- v29 = vld1q_f16(b + 8 * 13); \
- v0_7 = vfmaq_laneq_f16(v0_7, v28, va6, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v28, va6, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v28, va6, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v28, va6, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v28, va6, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v28, va6, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v28, va6, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v28, va6, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v29, va6, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v29, va6, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v29, va6, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v29, va6, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v29, va6, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v29, va6, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v29, va6, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v29, va6, 7); \
- va7 = vld1q_f16(a + 8 * 7); \
- v30 = vld1q_f16(b + 8 * 14); \
- v31 = vld1q_f16(b + 8 * 15); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 8); \
- v30 = vld1q_f16(b + 8 * 16); \
- v31 = vld1q_f16(b + 8 * 17); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 9); \
- v30 = vld1q_f16(b + 8 * 18); \
- v31 = vld1q_f16(b + 8 * 19); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 10); \
- v30 = vld1q_f16(b + 8 * 20); \
- v31 = vld1q_f16(b + 8 * 21); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 11); \
- v30 = vld1q_f16(b + 8 * 22); \
- v31 = vld1q_f16(b + 8 * 23); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 12); \
- v30 = vld1q_f16(b + 8 * 24); \
- v31 = vld1q_f16(b + 8 * 25); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 13); \
- v30 = vld1q_f16(b + 8 * 26); \
- v31 = vld1q_f16(b + 8 * 27); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 14); \
- v30 = vld1q_f16(b + 8 * 28); \
- v31 = vld1q_f16(b + 8 * 29); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- va7 = vld1q_f16(a + 8 * 15); \
- v30 = vld1q_f16(b + 8 * 30); \
- v31 = vld1q_f16(b + 8 * 31); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- l += 16; \
- __builtin_prefetch(b + 256, 0, 3); \
- __builtin_prefetch(a + 128, 0, 3); \
- b += 16 * 16; \
- a += 8 * 16;
+#define KERNEL_8x16_ACC16() \
+ do { \
+ va0 = vld1q_f16(a + 8 * 0); \
+ vb1 = vld1q_f16(b + 8 * 0); \
+ vb2 = vld1q_f16(b + 8 * 1); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 1); \
+ vb1 = vld1q_f16(b + 8 * 2); \
+ vb2 = vld1q_f16(b + 8 * 3); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 2); \
+ vb1 = vld1q_f16(b + 8 * 4); \
+ vb2 = vld1q_f16(b + 8 * 5); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 3); \
+ vb1 = vld1q_f16(b + 8 * 6); \
+ vb2 = vld1q_f16(b + 8 * 7); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 4); \
+ vb1 = vld1q_f16(b + 8 * 8); \
+ vb2 = vld1q_f16(b + 8 * 9); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 5); \
+ vb1 = vld1q_f16(b + 8 * 10); \
+ vb2 = vld1q_f16(b + 8 * 11); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 6); \
+ vb1 = vld1q_f16(b + 8 * 12); \
+ vb2 = vld1q_f16(b + 8 * 13); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 7); \
+ vb1 = vld1q_f16(b + 8 * 14); \
+ vb2 = vld1q_f16(b + 8 * 15); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 8); \
+ vb1 = vld1q_f16(b + 8 * 16); \
+ vb2 = vld1q_f16(b + 8 * 17); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 9); \
+ vb1 = vld1q_f16(b + 8 * 18); \
+ vb2 = vld1q_f16(b + 8 * 19); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 10); \
+ vb1 = vld1q_f16(b + 8 * 20); \
+ vb2 = vld1q_f16(b + 8 * 21); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 11); \
+ vb1 = vld1q_f16(b + 8 * 22); \
+ vb2 = vld1q_f16(b + 8 * 23); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 12); \
+ vb1 = vld1q_f16(b + 8 * 24); \
+ vb2 = vld1q_f16(b + 8 * 25); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 13); \
+ vb1 = vld1q_f16(b + 8 * 26); \
+ vb2 = vld1q_f16(b + 8 * 27); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 14); \
+ vb1 = vld1q_f16(b + 8 * 28); \
+ vb2 = vld1q_f16(b + 8 * 29); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 15); \
+ vb1 = vld1q_f16(b + 8 * 30); \
+ vb2 = vld1q_f16(b + 8 * 31); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ __builtin_prefetch(b + 256, 0, 3); \
+ __builtin_prefetch(a + 128, 0, 3); \
+ l += 16; \
+ b += 16 * 16; \
+ a += 8 * 16; \
+ } while (0)
// 2. Partial sum 1024 digits
-#define KERNEL_8x16_ACC8() \
- va0 = vld1q_f16(a); \
- v24 = vld1q_f16(b); \
- v25 = vld1q_f16(b + 8); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v26 = vld1q_f16(b + 16); \
- v27 = vld1q_f16(b + 24); \
- v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \
- va2 = vld1q_f16(a + 16); \
- v28 = vld1q_f16(b + 32); \
- v29 = vld1q_f16(b + 40); \
- v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \
- va3 = vld1q_f16(a + 24); \
- v30 = vld1q_f16(b + 48); \
- v31 = vld1q_f16(b + 56); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \
- va4 = vld1q_f16(a + 32); \
- v24 = vld1q_f16(b + 64); \
- v25 = vld1q_f16(b + 72); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va4, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va4, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va4, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va4, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va4, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va4, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va4, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va4, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va4, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va4, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va4, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va4, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va4, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va4, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va4, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va4, 7); \
- va5 = vld1q_f16(a + 40); \
- v26 = vld1q_f16(b + 80); \
- v27 = vld1q_f16(b + 88); \
- v0_7 = vfmaq_laneq_f16(v0_7, v26, va5, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v26, va5, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v26, va5, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v26, va5, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v26, va5, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v26, va5, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v26, va5, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v26, va5, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v27, va5, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v27, va5, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v27, va5, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v27, va5, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v27, va5, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v27, va5, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v27, va5, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v27, va5, 7); \
- va6 = vld1q_f16(a + 48); \
- v28 = vld1q_f16(b + 96); \
- v29 = vld1q_f16(b + 104); \
- v0_7 = vfmaq_laneq_f16(v0_7, v28, va6, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v28, va6, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v28, va6, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v28, va6, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v28, va6, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v28, va6, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v28, va6, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v28, va6, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v29, va6, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v29, va6, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v29, va6, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v29, va6, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v29, va6, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v29, va6, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v29, va6, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v29, va6, 7); \
- va7 = vld1q_f16(a + 56); \
- v30 = vld1q_f16(b + 112); \
- v31 = vld1q_f16(b + 120); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
- l += 8; \
- __builtin_prefetch(b + 128, 0, 3); \
- __builtin_prefetch(a + 64, 0, 3); \
- b += 16 * 8; \
- a += 8 * 8;
+#define KERNEL_8x16_ACC8() \
+ do { \
+ va0 = vld1q_f16(a); \
+ vb1 = vld1q_f16(b); \
+ vb2 = vld1q_f16(b + 8); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8); \
+ vb1 = vld1q_f16(b + 16); \
+ vb2 = vld1q_f16(b + 24); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 16); \
+ vb1 = vld1q_f16(b + 32); \
+ vb2 = vld1q_f16(b + 40); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 24); \
+ vb1 = vld1q_f16(b + 48); \
+ vb2 = vld1q_f16(b + 56); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 32); \
+ vb1 = vld1q_f16(b + 64); \
+ vb2 = vld1q_f16(b + 72); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 40); \
+ vb1 = vld1q_f16(b + 80); \
+ vb2 = vld1q_f16(b + 88); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 48); \
+ vb1 = vld1q_f16(b + 96); \
+ vb2 = vld1q_f16(b + 104); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 56); \
+ vb1 = vld1q_f16(b + 112); \
+ vb2 = vld1q_f16(b + 120); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ l += 8; \
+ __builtin_prefetch(b + 128, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ b += 16 * 8; \
+ a += 8 * 8; \
+ } while (0)
// 3. Partial sum 512 digits
-#define KERNEL_8x16_ACC4() \
- va0 = vld1q_f16(a); \
- v24 = vld1q_f16(b); \
- v25 = vld1q_f16(b + 8); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v26 = vld1q_f16(b + 16); \
- v27 = vld1q_f16(b + 24); \
- v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \
- va2 = vld1q_f16(a + 16); \
- v28 = vld1q_f16(b + 32); \
- v29 = vld1q_f16(b + 40); \
- v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \
- va3 = vld1q_f16(a + 24); \
- v30 = vld1q_f16(b + 48); \
- v31 = vld1q_f16(b + 56); \
- v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \
- l += 4; \
- __builtin_prefetch(b + 64, 0, 3); \
- __builtin_prefetch(a + 32, 0, 3); \
- b += 16 * 4; \
- a += 8 * 4;
+#define KERNEL_8x16_ACC4() \
+ do { \
+ va0 = vld1q_f16(a); \
+ vb1 = vld1q_f16(b); \
+ vb2 = vld1q_f16(b + 8); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 8); \
+ vb1 = vld1q_f16(b + 16); \
+ vb2 = vld1q_f16(b + 24); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 16); \
+ vb1 = vld1q_f16(b + 32); \
+ vb2 = vld1q_f16(b + 40); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ va0 = vld1q_f16(a + 24); \
+ vb1 = vld1q_f16(b + 48); \
+ vb2 = vld1q_f16(b + 56); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ l += 4; \
+ __builtin_prefetch(b + 64, 0, 3); \
+ __builtin_prefetch(a + 32, 0, 3); \
+ b += 16 * 4; \
+ a += 8 * 4; \
+ } while (0)
-// 3. Partial sum 128 digits
-#define KERNEL_8x16_ACC1() \
- va0 = vld1q_f16(a); \
- v24 = vld1q_f16(b); \
- v25 = vld1q_f16(b + 8); \
- v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \
- v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \
- v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \
- v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \
- v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \
- v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \
- v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \
- v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \
- v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \
- v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \
- v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \
- v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \
- v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \
- v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \
- v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \
- v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \
- l += 1; \
- __builtin_prefetch(b + 16, 0, 3); \
- __builtin_prefetch(a + 8, 0, 3); \
- b += 16 * 1; \
- a += 8 * 1;
+// 4. Partial sum 128 digits
+#define KERNEL_8x16_ACC1() \
+ do { \
+ va0 = vld1q_f16(a); \
+ vb1 = vld1q_f16(b); \
+ vb2 = vld1q_f16(b + 8); \
+ v0_7 = vfmaq_laneq_f16(v0_7, vb1, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, vb1, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, vb1, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, vb1, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, vb1, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, vb1, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, vb1, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, vb1, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, vb2, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, vb2, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, vb2, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, vb2, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, vb2, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, vb2, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, vb2, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, vb2, va0, 7); \
+ l += 1; \
+ __builtin_prefetch(b + 16, 0, 3); \
+ __builtin_prefetch(a + 8, 0, 3); \
+ b += 16 * 1; \
+ a += 8 * 1; \
+ } while (0)
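The ACC16/ACC8/ACC4/ACC1 variants above differ only in how many K-steps are folded into a single fp16 partial sum before it is flushed, which is where the accuracy/latency trade-off noted in the comments comes from. A minimal scalar sketch of the same idea (hypothetical helper, illustration only, not part of this patch):

/* Illustration only: accumulate `acc_len` products in fp16, then flush the
 * partial sum into an fp32 total. A smaller acc_len means fewer fp16
 * additions per partial sum (better accuracy); a larger one means fewer
 * flushes (lower latency), mirroring ACC1/ACC4/ACC8/ACC16. */
static float dot_fp16_partial(const __fp16 *a, const __fp16 *b, unsigned int K,
                              unsigned int acc_len) {
  float total = 0.F;
  for (unsigned int k = 0; k < K; k += acc_len) {
    __fp16 partial = 0.F;
    for (unsigned int i = 0; i < acc_len && k + i < K; ++i)
      partial += a[k + i] * b[k + i]; /* fp16 partial sum */
    total += (float)partial;          /* flush to fp32 */
  }
  return total;
}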
#define SAVE_KERNEL_8X16_F16_F32() \
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); \
- vst1q_f32(c + 4, \
- vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0_7)))); \
+ do { \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0_7)))); \
\
- vst1q_f32(c + 8, \
- vaddq_f32(vld1q_f32(c + 8), vcvt_f32_f16(vget_low_f16(v64_71)))); \
- vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v64_71)))); \
+ vst1q_f32( \
+ c + 8, vaddq_f32(vld1q_f32(c + 8), vcvt_f32_f16(vget_low_f16(v64_71)))); \
+ vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v64_71)))); \
\
- vst1q_f32(c + ldc, \
- vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v8_15)))); \
- vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v8_15)))); \
+ vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), \
+ vcvt_f32_f16(vget_low_f16(v8_15)))); \
+ vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v8_15)))); \
\
- vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v72_79)))); \
- vst1q_f32(c + ldc + 8 + 4, vaddq_f32(vld1q_f32(c + ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v72_79)))); \
+ vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v72_79)))); \
+ vst1q_f32(c + ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v72_79)))); \
\
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
- vcvt_f32_f16(vget_low_f16(v16_23)))); \
- vst1q_f32(c + 2 * ldc + 4, vaddq_f32(vld1q_f32(c + 2 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v16_23)))); \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v16_23)))); \
+ vst1q_f32(c + 2 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 2 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v16_23)))); \
\
- vst1q_f32(c + 2 * ldc + 8, vaddq_f32(vld1q_f32(c + 2 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v80_87)))); \
- vst1q_f32(c + 2 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v80_87)))); \
+ vst1q_f32(c + 2 * ldc + 8, vaddq_f32(vld1q_f32(c + 2 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v80_87)))); \
+ vst1q_f32(c + 2 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v80_87)))); \
\
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
- vcvt_f32_f16(vget_low_f16(v24_31)))); \
- vst1q_f32(c + 3 * ldc + 4, vaddq_f32(vld1q_f32(c + 3 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v24_31)))); \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v24_31)))); \
+ vst1q_f32(c + 3 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 3 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v24_31)))); \
\
- vst1q_f32(c + 3 * ldc + 8, vaddq_f32(vld1q_f32(c + 3 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v88_95)))); \
- vst1q_f32(c + 3 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v88_95)))); \
+ vst1q_f32(c + 3 * ldc + 8, vaddq_f32(vld1q_f32(c + 3 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v88_95)))); \
+ vst1q_f32(c + 3 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v88_95)))); \
\
- vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
- vcvt_f32_f16(vget_low_f16(v32_39)))); \
- vst1q_f32(c + 4 * ldc + 4, vaddq_f32(vld1q_f32(c + 4 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v32_39)))); \
+ vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v32_39)))); \
+ vst1q_f32(c + 4 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 4 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v32_39)))); \
\
- vst1q_f32(c + 4 * ldc + 8, vaddq_f32(vld1q_f32(c + 4 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v96_103)))); \
- vst1q_f32(c + 4 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v96_103)))); \
+ vst1q_f32(c + 4 * ldc + 8, \
+ vaddq_f32(vld1q_f32(c + 4 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v96_103)))); \
+ vst1q_f32(c + 4 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v96_103)))); \
\
- vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
- vcvt_f32_f16(vget_low_f16(v40_47)))); \
- vst1q_f32(c + 5 * ldc + 4, vaddq_f32(vld1q_f32(c + 5 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v40_47)))); \
- vst1q_f32(c + 5 * ldc + 8, vaddq_f32(vld1q_f32(c + 5 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v104_111)))); \
- vst1q_f32(c + 5 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v104_111)))); \
+ vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v40_47)))); \
+ vst1q_f32(c + 5 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 5 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v40_47)))); \
+ vst1q_f32(c + 5 * ldc + 8, \
+ vaddq_f32(vld1q_f32(c + 5 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v104_111)))); \
+ vst1q_f32(c + 5 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v104_111)))); \
\
- vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
- vcvt_f32_f16(vget_low_f16(v48_55)))); \
- vst1q_f32(c + 6 * ldc + 4, vaddq_f32(vld1q_f32(c + 6 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v48_55)))); \
+ vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v48_55)))); \
+ vst1q_f32(c + 6 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 6 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v48_55)))); \
\
- vst1q_f32(c + 6 * ldc + 8, vaddq_f32(vld1q_f32(c + 6 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v112_119)))); \
- vst1q_f32(c + 6 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v112_119)))); \
+ vst1q_f32(c + 6 * ldc + 8, \
+ vaddq_f32(vld1q_f32(c + 6 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v112_119)))); \
+ vst1q_f32(c + 6 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v112_119)))); \
\
- vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
- vcvt_f32_f16(vget_low_f16(v56_63)))); \
- vst1q_f32(c + 7 * ldc + 4, vaddq_f32(vld1q_f32(c + 7 * ldc + 4), \
- vcvt_f32_f16(vget_high_f16(v56_63)))); \
+ vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v56_63)))); \
+ vst1q_f32(c + 7 * ldc + 4, \
+ vaddq_f32(vld1q_f32(c + 7 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v56_63)))); \
\
- vst1q_f32(c + 7 * ldc + 8, vaddq_f32(vld1q_f32(c + 7 * ldc + 8), \
- vcvt_f32_f16(vget_low_f16(v120_127)))); \
- vst1q_f32(c + 7 * ldc + 8 + 4, \
- vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), \
- vcvt_f32_f16(vget_high_f16(v120_127))));
+ vst1q_f32(c + 7 * ldc + 8, \
+ vaddq_f32(vld1q_f32(c + 7 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v120_127)))); \
+ vst1q_f32(c + 7 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v120_127)))); \
+ } while (0)
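For orientation, the macros compose inside the kernel body along the lines sketched below; the exact K16/K8 loop bounds and split are assumptions for illustration (the real loop head is visible further down in this hunk):

  /* Sketch, not verbatim from the patch: each pass zeroes the fp16
   * accumulators, folds a block of K-steps into them, then widens the
   * result into the fp32 C tile. */
  l = 0;
  for (; l < K16;) {    /* blocks of 16 K-steps */
    INIT_KERNEL_8X16();
    KERNEL_8x16_ACC16();
    SAVE_KERNEL_8X16_F16_F32();
  }
  for (; l < K8;) {     /* leftover blocks of 8 */
    INIT_KERNEL_8X16();
    KERNEL_8x16_ACC8();
    SAVE_KERNEL_8X16_F16_F32();
  }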
/**
* @brief hgemm 8x16 kernel sc = sa * sb
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
- assert(M % 8 == 0 && N % 16 == 0);
+ assert(M % 8 == 0 && N % 16 == 0 && K % 8 == 0);
__fp16 *a = sa, *b = sb, *c = sc;
unsigned int i, j, l;
float16x8_t v80_87, v88_95;
float16x8_t v96_103, v104_111;
float16x8_t v112_119, v120_127;
-
- float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
- float16x8_t va0, va1, va2, va3;
+ float16x8_t vb1, vb2;
+ float16x8_t va0;
INIT_KERNEL_8X16();
l = 0;
void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
- assert(M % 8 == 0 && N % 16 == 0);
+ assert(M % 8 == 0 && N % 16 == 0 && K % 4 == 0);
__fp16 *a = sa, *b = sb;
float *c = sc;
float16x8_t v80_87, v88_95;
float16x8_t v96_103, v104_111;
float16x8_t v112_119, v120_127;
- float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
- float16x8_t va0, va1, va2, va3, va4, va5, va6, va7;
+ float16x8_t vb1, vb2;
+ float16x8_t va0;
l = 0;
for (; l < K16;) {
INIT_KERNEL_8X16();
#include <hgemm_common.h>
#include <stdlib.h>
-#define INIT_KERNEL_8x8() \
- v24 = vdupq_n_f16(0.F); \
- v25 = vdupq_n_f16(0.F); \
- v26 = vdupq_n_f16(0.F); \
- v27 = vdupq_n_f16(0.F); \
- v28 = vdupq_n_f16(0.F); \
- v29 = vdupq_n_f16(0.F); \
- v30 = vdupq_n_f16(0.F); \
- v31 = vdupq_n_f16(0.F);
+#define INIT_KERNEL_8x8() \
+ do { \
+ v24 = vdupq_n_f16(0.F); \
+ v25 = vdupq_n_f16(0.F); \
+ v26 = vdupq_n_f16(0.F); \
+ v27 = vdupq_n_f16(0.F); \
+ v28 = vdupq_n_f16(0.F); \
+ v29 = vdupq_n_f16(0.F); \
+ v30 = vdupq_n_f16(0.F); \
+ v31 = vdupq_n_f16(0.F); \
+ } while (0)
// 1. Partial sum 1024 digits
-#define KERNEL_8x8_ACC16() \
- va0 = vld1q_f16(a); \
- v16 = vld1q_f16(b); \
- v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
- v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
- v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
- v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
- v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
- v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
- v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
- v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v17 = vld1q_f16(b + 8); \
- v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
- v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
- v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
- v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
- v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
- v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
- v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
- v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
- va2 = vld1q_f16(a + 8 * 2); \
- v18 = vld1q_f16(b + 8 * 2); \
- v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
- v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
- v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
- v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
- v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
- v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
- v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
- v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
- va3 = vld1q_f16(a + 8 * 3); \
- v19 = vld1q_f16(b + 8 * 3); \
- v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
- v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
- v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
- v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
- v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
- v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
- v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
- v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
- va4 = vld1q_f16(a + 8 * 4); \
- v20 = vld1q_f16(b + 8 * 4); \
- v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \
- v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \
- v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \
- v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \
- v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \
- v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \
- v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \
- v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \
- va5 = vld1q_f16(a + 8 * 5); \
- v21 = vld1q_f16(b + 8 * 5); \
- v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \
- v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \
- v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \
- v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \
- v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \
- v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \
- v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \
- v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \
- va6 = vld1q_f16(a + 8 * 6); \
- v22 = vld1q_f16(b + 8 * 6); \
- v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \
- v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \
- v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \
- v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \
- v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \
- v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \
- v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \
- v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \
- va7 = vld1q_f16(a + 8 * 7); \
- v23 = vld1q_f16(b + 8 * 7); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 8); \
- v23 = vld1q_f16(b + 8 * 8); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 9); \
- v23 = vld1q_f16(b + 8 * 9); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 10); \
- v23 = vld1q_f16(b + 8 * 10); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 11); \
- v23 = vld1q_f16(b + 8 * 11); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 12); \
- v23 = vld1q_f16(b + 8 * 12); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 13); \
- v23 = vld1q_f16(b + 8 * 13); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 14); \
- v23 = vld1q_f16(b + 8 * 14); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- va7 = vld1q_f16(a + 8 * 15); \
- v23 = vld1q_f16(b + 8 * 15); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- __builtin_prefetch(b + 128, 0, 3); \
- __builtin_prefetch(a + 128, 0, 3); \
- l += 16; \
- b += 8 * 16; \
- a += 8 * 16;
+#define KERNEL_8x8_ACC16() \
+ do { \
+ va0 = vld1q_f16(a); \
+ v16 = vld1q_f16(b); \
+ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
+ va0 = vld1q_f16(a + 8); \
+ v17 = vld1q_f16(b + 8); \
+ v24 = vfmaq_laneq_f16(v24, v17, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v17, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v17, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v17, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v17, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v17, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v17, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v17, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 2); \
+ v18 = vld1q_f16(b + 8 * 2); \
+ v24 = vfmaq_laneq_f16(v24, v18, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v18, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v18, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v18, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v18, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v18, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v18, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v18, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 3); \
+ v19 = vld1q_f16(b + 8 * 3); \
+ v24 = vfmaq_laneq_f16(v24, v19, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v19, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v19, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v19, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v19, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v19, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v19, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v19, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 4); \
+ v20 = vld1q_f16(b + 8 * 4); \
+ v24 = vfmaq_laneq_f16(v24, v20, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v20, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v20, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v20, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v20, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v20, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v20, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v20, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 5); \
+ v21 = vld1q_f16(b + 8 * 5); \
+ v24 = vfmaq_laneq_f16(v24, v21, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v21, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v21, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v21, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v21, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v21, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v21, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v21, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 6); \
+ v22 = vld1q_f16(b + 8 * 6); \
+ v24 = vfmaq_laneq_f16(v24, v22, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v22, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v22, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v22, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v22, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v22, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v22, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v22, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 7); \
+ v23 = vld1q_f16(b + 8 * 7); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 8); \
+ v23 = vld1q_f16(b + 8 * 8); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 9); \
+ v23 = vld1q_f16(b + 8 * 9); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 10); \
+ v23 = vld1q_f16(b + 8 * 10); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 11); \
+ v23 = vld1q_f16(b + 8 * 11); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 12); \
+ v23 = vld1q_f16(b + 8 * 12); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 13); \
+ v23 = vld1q_f16(b + 8 * 13); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 14); \
+ v23 = vld1q_f16(b + 8 * 14); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ va0 = vld1q_f16(a + 8 * 15); \
+ v23 = vld1q_f16(b + 8 * 15); \
+ v24 = vfmaq_laneq_f16(v24, v23, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va0, 7); \
+ __builtin_prefetch(b + 128, 0, 3); \
+ __builtin_prefetch(a + 128, 0, 3); \
+ l += 16; \
+ b += 8 * 16; \
+ a += 8 * 16; \
+ } while (0)
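In the 8x8 kernel each of v24..v31 holds one 8-wide row of the output tile; a save step in the spirit of the 8x16 save above would fold them back into C row by row. A sketch under that assumption (not this file's own save macro):

  vst1q_f16(c + 0 * ldc, vaddq_f16(vld1q_f16(c + 0 * ldc), v24));
  vst1q_f16(c + 1 * ldc, vaddq_f16(vld1q_f16(c + 1 * ldc), v25));
  vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26));
  vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27));
  vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28));
  vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29));
  vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30));
  vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31));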
// 2. Partial sum 512 digits
-#define KERNEL_8x8_ACC8() \
- va0 = vld1q_f16(a); \
- v16 = vld1q_f16(b); \
- v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
- v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
- v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
- v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
- v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
- v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
- v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
- v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v17 = vld1q_f16(b + 8); \
- v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
- v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
- v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
- v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
- v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
- v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
- v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
- v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
- va2 = vld1q_f16(a + 16); \
- v18 = vld1q_f16(b + 16); \
- v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
- v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
- v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
- v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
- v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
- v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
- v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
- v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
- va3 = vld1q_f16(a + 24); \
- v19 = vld1q_f16(b + 24); \
- v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
- v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
- v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
- v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
- v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
- v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
- v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
- v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
- va4 = vld1q_f16(a + 32); \
- v20 = vld1q_f16(b + 32); \
- v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \
- v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \
- v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \
- v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \
- v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \
- v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \
- v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \
- v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \
- va5 = vld1q_f16(a + 40); \
- v21 = vld1q_f16(b + 40); \
- v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \
- v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \
- v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \
- v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \
- v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \
- v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \
- v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \
- v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \
- va6 = vld1q_f16(a + 48); \
- v22 = vld1q_f16(b + 48); \
- v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \
- v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \
- v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \
- v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \
- v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \
- v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \
- v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \
- v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \
- va7 = vld1q_f16(a + 56); \
- v23 = vld1q_f16(b + 56); \
- v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
- v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
- v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
- v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
- v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
- v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
- v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
- v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
- __builtin_prefetch(b + 64, 0, 3); \
- __builtin_prefetch(a + 64, 0, 3); \
- l += 8; \
- b += 8 * 8; \
- a += 8 * 8;
+#define KERNEL_8x8_ACC8() \
+ do { \
+ va0 = vld1q_f16(a); \
+ v16 = vld1q_f16(b); \
+ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
+ va1 = vld1q_f16(a + 8); \
+ v17 = vld1q_f16(b + 8); \
+ v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
+ v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
+ v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
+ v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
+ v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
+ v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
+ v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
+ v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
+ va2 = vld1q_f16(a + 16); \
+ v18 = vld1q_f16(b + 16); \
+ v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
+ v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
+ v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
+ v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
+ v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
+ v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
+ v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
+ v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
+ va3 = vld1q_f16(a + 24); \
+ v19 = vld1q_f16(b + 24); \
+ v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
+ v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
+ v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
+ v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
+ v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
+ v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
+ v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
+ v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
+ va4 = vld1q_f16(a + 32); \
+ v20 = vld1q_f16(b + 32); \
+ v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \
+ v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \
+ v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \
+ v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \
+ v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \
+ v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \
+ v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \
+ v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \
+ va5 = vld1q_f16(a + 40); \
+ v21 = vld1q_f16(b + 40); \
+ v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \
+ v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \
+ v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \
+ v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \
+ v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \
+ v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \
+ v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \
+ v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \
+ va6 = vld1q_f16(a + 48); \
+ v22 = vld1q_f16(b + 48); \
+ v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \
+ v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \
+ v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \
+ v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \
+ v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \
+ v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \
+ v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \
+ v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \
+ va7 = vld1q_f16(a + 56); \
+ v23 = vld1q_f16(b + 56); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ __builtin_prefetch(b + 64, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ l += 8; \
+ b += 8 * 8; \
+ a += 8 * 8; \
+ } while (0)
// 3. Partial sum 256 digits
-#define KERNEL_8x8_ACC4() \
- va0 = vld1q_f16(a); \
- v16 = vld1q_f16(b); \
- v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
- v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
- v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
- v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
- v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
- v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
- v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
- v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
- va1 = vld1q_f16(a + 8); \
- v17 = vld1q_f16(b + 8); \
- v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
- v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
- v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
- v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
- v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
- v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
- v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
- v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
- va2 = vld1q_f16(a + 16); \
- v18 = vld1q_f16(b + 16); \
- v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
- v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
- v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
- v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
- v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
- v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
- v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
- v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
- va3 = vld1q_f16(a + 24); \
- v19 = vld1q_f16(b + 24); \
- v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
- v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
- v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
- v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
- v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
- v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
- v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
- v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
- __builtin_prefetch(b + 32, 0, 3); \
- __builtin_prefetch(a + 32, 0, 3); \
- l += 4; \
- b += 8 * 4; \
- a += 8 * 4;
+#define KERNEL_8x8_ACC4() \
+ do { \
+ va0 = vld1q_f16(a); \
+ v16 = vld1q_f16(b); \
+ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
+ va1 = vld1q_f16(a + 8); \
+ v17 = vld1q_f16(b + 8); \
+ v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
+ v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
+ v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
+ v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
+ v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
+ v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
+ v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
+ v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
+ va2 = vld1q_f16(a + 16); \
+ v18 = vld1q_f16(b + 16); \
+ v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
+ v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
+ v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
+ v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
+ v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
+ v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
+ v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
+ v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
+ va3 = vld1q_f16(a + 24); \
+ v19 = vld1q_f16(b + 24); \
+ v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
+ v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
+ v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
+ v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
+ v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
+ v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
+ v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
+ v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
+ __builtin_prefetch(b + 32, 0, 3); \
+ __builtin_prefetch(a + 32, 0, 3); \
+ l += 4; \
+ b += 8 * 4; \
+ a += 8 * 4; \
+ } while (0)
// 4. Partial sum 64 digits
-#define KERNEL_8x8_ACC1() \
- va0 = vld1q_f16(a); \
- v16 = vld1q_f16(b); \
- v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
- v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
- v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
- v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
- v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
- v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
- v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
- v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
- __builtin_prefetch(b + 8, 0, 3); \
- __builtin_prefetch(a + 8, 0, 3); \
- l += 1; \
- b += 8 * 1; \
- a += 8 * 1;
+#define KERNEL_8x8_ACC1() \
+ do { \
+ va0 = vld1q_f16(a); \
+ v16 = vld1q_f16(b); \
+ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
+ __builtin_prefetch(b + 8, 0, 3); \
+ __builtin_prefetch(a + 8, 0, 3); \
+ l += 1; \
+ b += 8 * 1; \
+ a += 8 * 1; \
+ } while (0)
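/*
 * The do { ... } while (0) wrapper added to each kernel macro makes the whole
 * multi-statement body expand as a single statement, so an invocation followed
 * by a semicolon stays well-formed even inside an unbraced if/else, e.g.
 * (illustrative only):
 *
 *   if (K - l >= 8)
 *     KERNEL_8x8_ACC8();
 *   else
 *     KERNEL_8x8_ACC1();
 */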
-#define SAVE_KERNEL_8X8_F16_f32() \
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); \
- vst1q_f32(c + 4, \
- vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); \
- \
- vst1q_f32(c + ldc, \
- vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v25)))); \
- vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
- vcvt_f32_f16(vget_high_f16(v25)))); \
- \
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
- vcvt_f32_f16(vget_low_f16(v26)))); \
- vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
- vcvt_f32_f16(vget_high_f16(v26)))); \
- \
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
- vcvt_f32_f16(vget_low_f16(v27)))); \
- vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
- vcvt_f32_f16(vget_high_f16(v27)))); \
- \
- vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
- vcvt_f32_f16(vget_low_f16(v28)))); \
- vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), \
- vcvt_f32_f16(vget_high_f16(v28)))); \
- \
- vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
- vcvt_f32_f16(vget_low_f16(v29)))); \
- vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), \
- vcvt_f32_f16(vget_high_f16(v29)))); \
- \
- vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
- vcvt_f32_f16(vget_low_f16(v30)))); \
- vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), \
- vcvt_f32_f16(vget_high_f16(v30)))); \
- \
- vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
- vcvt_f32_f16(vget_low_f16(v31)))); \
- vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), \
- vcvt_f32_f16(vget_high_f16(v31))));
+#define SAVE_KERNEL_8X8_F16_f32() \
+ do { \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); \
+ \
+ vst1q_f32(c + ldc, \
+ vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v25)))); \
+ vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
+ vcvt_f32_f16(vget_high_f16(v25)))); \
+ \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v26)))); \
+ vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v26)))); \
+ \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v27)))); \
+ vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v27)))); \
+ \
+ vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v28)))); \
+ vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v28)))); \
+ \
+ vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v29)))); \
+ vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v29)))); \
+ \
+ vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v30)))); \
+ vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v30)))); \
+ \
+ vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v31)))); \
+ vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v31)))); \
+ } while (0)
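/*
 * Illustrative sketch (hypothetical helper, not the kernel defined below):
 * one way the ACC/SAVE macros above can combine for an fp32-accumulating
 * 8x8 output tile. It assumes only the NEON types and intrinsics from
 * <arm_neon.h>, already available via hgemm_common.h.
 */
static inline void hgemm_8x8_partial_sum_sketch(unsigned int K, __fp16 *a,
                                                __fp16 *b, float *c,
                                                unsigned int ldc) {
  float16x8_t v16, v17, v18, v19, v20, v21, v22, v23;
  float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
  float16x8_t va0, va1, va2, va3, va4, va5, va6, va7;
  unsigned int l = 0;
  while (l < K) {
    /* Reset the fp16 accumulators so each partial sum stays short and
     * half-precision round-off does not build up across all of K. */
    v24 = v25 = v26 = v27 = vdupq_n_f16(0.F);
    v28 = v29 = v30 = v31 = vdupq_n_f16(0.F);
    if (K - l >= 8)
      KERNEL_8x8_ACC8(); /* 512 fp16 terms per partial sum */
    else if (K - l >= 4)
      KERNEL_8x8_ACC4(); /* 256 fp16 terms */
    else
      KERNEL_8x8_ACC1(); /* 64 fp16 terms */
    SAVE_KERNEL_8X8_F16_f32(); /* flush partial sums into the fp32 C tile */
  }
}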
/**
* @brief hgemm 8x8 kernel sc = sa * sb
void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *sa, __fp16 *sb, __fp16 *sc, unsigned int ldc) {
assert(M > 0 && N > 0 && K > 0);
- assert(M % 8 == 0 && N % 8 == 0 && K % 8 == 0);
+ assert(M % 8 == 0 && N % 8 == 0 && K % 4 == 0);
__fp16 *a = sa, *b = sb, *c = sc;
unsigned int i, j, l;