--- /dev/null
+/*********************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+/*
+ * This file is based on dgemm_kernel_4x8_haswell.s (original copyright above).
+ * The content was translated from assembly to C with intrinsics, significantly
+ * simplified, and extended with AVX512 support by Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+
+#include "common.h"
+#include <immintrin.h>
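+
+/*
+ * Portability note: the kernels below use GCC/Clang vector extensions,
+ * which allow plain arithmetic operators on __m128d/__m256d/__m512d
+ * values. A minimal sketch of the idiom (illustration only; the names
+ * are hypothetical and not part of the kernel):
+ *
+ *   __m256d a   = _mm256_set1_pd(2.0);
+ *   __m256d b   = _mm256_set1_pd(3.0);
+ *   __m256d acc = _mm256_setzero_pd();
+ *   acc += a * b;  // element-wise multiply-accumulate; compilers may
+ *                  // fuse this into vfmadd when FMA is available
+ */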
+
+
+/*******************************************************************************************
+* Macro definitions
+*******************************************************************************************/
+
+
+/******************************************************************************************/
+
+
+#define INIT4x8() \
+ ymm4 = _mm256_setzero_pd(); \
+ ymm5 = _mm256_setzero_pd(); \
+ ymm6 = _mm256_setzero_pd(); \
+ ymm7 = _mm256_setzero_pd(); \
+ ymm8 = _mm256_setzero_pd(); \
+ ymm9 = _mm256_setzero_pd(); \
+ ymm10 = _mm256_setzero_pd(); \
+ ymm11 = _mm256_setzero_pd(); \
+
+
+#define KERNEL4x8_SUB() \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+/* ymm0 [ A B C D ] */ \
+ ymm1 = _mm256_loadu_pd(BO - 12); \
+ ymm2 = _mm256_loadu_pd(BO - 8); \
+/* ymm1 [ 1 2 3 4 ] */ \
+/* ymm2 [ 5 6 7 8 ] */ \
+ \
+ ymm4 += ymm0 * ymm1; \
+/* ymm4 += [ A*1 | B*2 | C*3 | D*4 ] */ \
+ ymm8 += ymm0 * ymm2; \
+/* ymm8 += [ A*5 | B*6 | C*7 | D*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+/* ymm0 [ B A D C ] */ \
+ ymm5 += ymm0 * ymm1; \
+/* ymm5 += [ B*1 | A*2 | D*3 | C*4 ] */ \
+ ymm9 += ymm0 * ymm2; \
+/* ymm9 += [ B*5 | A*6 | D*7 | C*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \
+/* ymm0 [ C D A B ] */ \
+ ymm6 += ymm0 * ymm1; \
+/* ymm6 += [ C*1 | D*2 | A*3 | B*4 ] */ \
+        ymm10 += ymm0 * ymm2; \
+/* ymm10 += [ C*5 | D*6 | A*7 | B*8 ] */ \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+/* ymm0 [ D C B A ] */ \
+ ymm7 += ymm0 * ymm1; \
+/* ymm7 += [ D*1 | C*2 | B*3 | A*4 ] */ \
+        ymm11 += ymm0 * ymm2; \
+/* ymm11 += [ D*5 | C*6 | B*7 | A*8 ] */ \
+ AO += 4; \
+ BO += 8;
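+
+/*
+ * Reference semantics (a scalar sketch, not used by the kernel): ignoring
+ * the pointer bias (the loads above use AO - 16 and BO - 12), each
+ * KERNEL4x8_SUB invocation is one rank-1 update of a 4x8 block,
+ *
+ *   for (int i = 0; i < 4; i++)
+ *       for (int j = 0; j < 8; j++)
+ *           c[i][j] += a[i] * b[j];
+ *
+ * with the rows held in the rotated layouts documented above; SAVE4x8
+ * un-rotates them before C is updated.
+ */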
+
+
+#define SAVE4x8(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm4 *= ymm0; \
+ ymm5 *= ymm0; \
+ ymm6 *= ymm0; \
+ ymm7 *= ymm0; \
+ ymm8 *= ymm0; \
+ ymm9 *= ymm0; \
+ ymm10 *= ymm0; \
+ ymm11 *= ymm0; \
+ \
+/* Entry values: */ \
+/* ymm4 = a [ A*1 | B*2 | C*3 | D*4 ] */ \
+/* ymm5 = a [ B*1 | A*2 | D*3 | C*4 ] */ \
+/* ymm6 = a [ C*1 | D*2 | A*3 | B*4 ] */ \
+/* ymm7 = a [ D*1 | C*2 | B*3 | A*4 ] */ \
+/* ymm8 = a [ A*5 | B*6 | C*7 | D*8 ] */ \
+/* ymm9 = a [ B*5 | A*6 | D*7 | C*8 ] */ \
+/* ymm10 = a [ C*5 | D*6 | A*7 | B*8 ] */ \
+/* ymm11 = a [ D*5 | C*6 | B*7 | A*8 ] */ \
+ \
+ ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \
+/* ymm5 = a [ A*2 | B*1 | C*4 | D*3 ] */ \
+ ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \
+/* ymm7 = a [ C*2 | D*1 | A*4 | B*3 ] */ \
+ \
+ ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \
+ ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
+/* ymm0 = a [ A*1 | B*1 | C*3 | D*3 ] */ \
+/* ymm1 = a [ A*2 | B*2 | C*4 | D*4 ] */ \
+ ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
+ ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
+/* ymm2 = a [ C*1 | D*1 | A*3 | B*3 ] */ \
+/* ymm3 = a [ C*2 | D*2 | A*4 | B*4 ] */ \
+ \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
+/* ymm2 = a [ B*3 | A*3 | D*1 | C*1 ] */ \
+/* ymm3 = a [ B*4 | A*4 | D*2 | C*2 ] */ \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
+/* ymm2 = a [ A*3 | B*3 | C*1 | D*1 ] */ \
+/* ymm3 = a [ A*4 | B*4 | C*2 | D*2 ] */ \
+ \
+ ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
+ ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
+/* ymm4 = a [ A*1 | B*1 | C*1 | D*1 ] */ \
+/* ymm5 = a [ A*2 | B*2 | C*2 | D*2 ] */ \
+ ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
+ ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
+/* ymm6 = a [ A*3 | B*3 | C*3 | D*3 ] */ \
+/* ymm7 = a [ A*4 | B*4 | C*4 | D*4 ] */ \
+ \
+ ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
+ ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
+ ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \
+ ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \
+ _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \
+ _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \
+ _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \
+ _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \
+ \
+ ymm9 = _mm256_permute4x64_pd(ymm9, 0xb1); \
+ ymm11 = _mm256_permute4x64_pd(ymm11, 0xb1); \
+ \
+ ymm0 = _mm256_blend_pd(ymm8, ymm9, 0x0a); \
+ ymm1 = _mm256_blend_pd(ymm8, ymm9, 0x05); \
+ ymm2 = _mm256_blend_pd(ymm10, ymm11, 0x0a); \
+ ymm3 = _mm256_blend_pd(ymm10, ymm11, 0x05); \
+ \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
+ \
+ ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
+ ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
+ ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
+ ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
+ \
+ ymm4 += _mm256_loadu_pd(CO1 + (4 * ldc)); \
+ ymm5 += _mm256_loadu_pd(CO1 + (5 * ldc)); \
+ ymm6 += _mm256_loadu_pd(CO1 + (6 * ldc)); \
+ ymm7 += _mm256_loadu_pd(CO1 + (7 * ldc)); \
+ _mm256_storeu_pd(CO1 + (4 * ldc), ymm4); \
+ _mm256_storeu_pd(CO1 + (5 * ldc), ymm5); \
+ _mm256_storeu_pd(CO1 + (6 * ldc), ymm6); \
+ _mm256_storeu_pd(CO1 + (7 * ldc), ymm7); \
+ \
+ CO1 += 4;
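+
+/*
+ * Blend-mask reference (illustration): in _mm256_blend_pd(x, y, imm),
+ * bit i of imm selects lane i from y, otherwise from x. Hence 0x0a keeps
+ * lanes 0 and 2 of x and takes lanes 1 and 3 of y, while 0x05 does the
+ * opposite; SAVE4x8 uses these pairs to re-interleave the rotated
+ * accumulators into plain row order.
+ */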
+
+/******************************************************************************************/
+
+#define INIT2x8() \
+ xmm4 = _mm_setzero_pd(); \
+ xmm5 = _mm_setzero_pd(); \
+ xmm6 = _mm_setzero_pd(); \
+ xmm7 = _mm_setzero_pd(); \
+ xmm8 = _mm_setzero_pd(); \
+ xmm9 = _mm_setzero_pd(); \
+ xmm10 = _mm_setzero_pd(); \
+ xmm11 = _mm_setzero_pd(); \
+
+
+#define KERNEL2x8_SUB() \
+ xmm0 = _mm_loadu_pd(AO - 16); \
+ xmm1 = _mm_set1_pd(*(BO - 12)); \
+ xmm2 = _mm_set1_pd(*(BO - 11)); \
+ xmm3 = _mm_set1_pd(*(BO - 10)); \
+ xmm4 += xmm0 * xmm1; \
+ xmm1 = _mm_set1_pd(*(BO - 9)); \
+ xmm5 += xmm0 * xmm2; \
+ xmm2 = _mm_set1_pd(*(BO - 8)); \
+ xmm6 += xmm0 * xmm3; \
+ xmm3 = _mm_set1_pd(*(BO - 7)); \
+ xmm7 += xmm0 * xmm1; \
+ xmm1 = _mm_set1_pd(*(BO - 6)); \
+ xmm8 += xmm0 * xmm2; \
+ xmm2 = _mm_set1_pd(*(BO - 5)); \
+ xmm9 += xmm0 * xmm3; \
+ xmm10 += xmm0 * xmm1; \
+ xmm11 += xmm0 * xmm2; \
+ BO += 8; \
+ AO += 2;
+
+#define SAVE2x8(ALPHA) \
+ xmm0 = _mm_set1_pd(ALPHA); \
+ xmm4 *= xmm0; \
+ xmm5 *= xmm0; \
+ xmm6 *= xmm0; \
+ xmm7 *= xmm0; \
+ xmm8 *= xmm0; \
+ xmm9 *= xmm0; \
+ xmm10 *= xmm0; \
+ xmm11 *= xmm0; \
+ \
+ xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \
+ xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \
+ xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \
+ xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \
+ \
+ _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \
+ _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \
+ _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \
+ _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \
+ \
+ xmm8 += _mm_loadu_pd(CO1 + (4 * ldc)); \
+ xmm9 += _mm_loadu_pd(CO1 + (5 * ldc)); \
+    xmm10 += _mm_loadu_pd(CO1 + (6 * ldc)); \
+    xmm11 += _mm_loadu_pd(CO1 + (7 * ldc)); \
+ _mm_storeu_pd(CO1 + (4 * ldc), xmm8); \
+ _mm_storeu_pd(CO1 + (5 * ldc), xmm9); \
+ _mm_storeu_pd(CO1 + (6 * ldc), xmm10); \
+ _mm_storeu_pd(CO1 + (7 * ldc), xmm11); \
+ CO1 += 2;
+
+
+
+
+/******************************************************************************************/
+
+#define INIT1x8() \
+ dbl4 = 0; \
+ dbl5 = 0; \
+ dbl6 = 0; \
+ dbl7 = 0; \
+ dbl8 = 0; \
+ dbl9 = 0; \
+ dbl10 = 0; \
+ dbl11 = 0;
+
+
+#define KERNEL1x8_SUB() \
+ dbl0 = *(AO - 16); \
+ dbl1 = *(BO - 12); \
+ dbl2 = *(BO - 11); \
+ dbl3 = *(BO - 10); \
+ dbl4 += dbl0 * dbl1; \
+ dbl1 = *(BO - 9); \
+ dbl5 += dbl0 * dbl2; \
+ dbl2 = *(BO - 8); \
+ dbl6 += dbl0 * dbl3; \
+ dbl3 = *(BO - 7); \
+ dbl7 += dbl0 * dbl1; \
+ dbl1 = *(BO - 6); \
+ dbl8 += dbl0 * dbl2; \
+ dbl2 = *(BO - 5); \
+ dbl9 += dbl0 * dbl3; \
+ dbl10 += dbl0 * dbl1; \
+ dbl11 += dbl0 * dbl2; \
+ BO += 8; \
+ AO += 1;
+
+
+#define SAVE1x8(ALPHA) \
+ dbl0 = ALPHA; \
+ dbl4 *= dbl0; \
+ dbl5 *= dbl0; \
+ dbl6 *= dbl0; \
+ dbl7 *= dbl0; \
+ dbl8 *= dbl0; \
+ dbl9 *= dbl0; \
+ dbl10 *= dbl0; \
+ dbl11 *= dbl0; \
+ \
+ dbl4 += *(CO1 + (0 * ldc)); \
+ dbl5 += *(CO1 + (1 * ldc)); \
+ dbl6 += *(CO1 + (2 * ldc)); \
+ dbl7 += *(CO1 + (3 * ldc)); \
+ *(CO1 + (0 * ldc)) = dbl4; \
+ *(CO1 + (1 * ldc)) = dbl5; \
+ *(CO1 + (2 * ldc)) = dbl6; \
+ *(CO1 + (3 * ldc)) = dbl7; \
+ \
+ dbl8 += *(CO1 + (4 * ldc)); \
+ dbl9 += *(CO1 + (5 * ldc)); \
+ dbl10 += *(CO1 + (6 * ldc)); \
+ dbl11 += *(CO1 + (7 * ldc)); \
+ *(CO1 + (4 * ldc)) = dbl8; \
+ *(CO1 + (5 * ldc)) = dbl9; \
+ *(CO1 + (6 * ldc)) = dbl10; \
+ *(CO1 + (7 * ldc)) = dbl11; \
+ \
+ CO1 += 1;
+
+
+
+
+
+
+/******************************************************************************************/
+
+#define INIT4x4() \
+ ymm4 = _mm256_setzero_pd(); \
+ ymm5 = _mm256_setzero_pd(); \
+ ymm6 = _mm256_setzero_pd(); \
+ ymm7 = _mm256_setzero_pd(); \
+
+
+#define KERNEL4x4_SUB() \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+ ymm1 = _mm256_loadu_pd(BO - 12); \
+ \
+ ymm4 += ymm0 * ymm1; \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+ ymm5 += ymm0 * ymm1; \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0x1b); \
+ ymm6 += ymm0 * ymm1; \
+ \
+ ymm0 = _mm256_permute4x64_pd(ymm0, 0xb1); \
+ ymm7 += ymm0 * ymm1; \
+ AO += 4; \
+ BO += 4;
+
+
+#define SAVE4x4(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm4 *= ymm0; \
+ ymm5 *= ymm0; \
+ ymm6 *= ymm0; \
+ ymm7 *= ymm0; \
+ \
+ ymm5 = _mm256_permute4x64_pd(ymm5, 0xb1); \
+ ymm7 = _mm256_permute4x64_pd(ymm7, 0xb1); \
+ \
+ ymm0 = _mm256_blend_pd(ymm4, ymm5, 0x0a); \
+ ymm1 = _mm256_blend_pd(ymm4, ymm5, 0x05); \
+ ymm2 = _mm256_blend_pd(ymm6, ymm7, 0x0a); \
+ ymm3 = _mm256_blend_pd(ymm6, ymm7, 0x05); \
+ \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0x1b); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0x1b); \
+ ymm2 = _mm256_permute4x64_pd(ymm2, 0xb1); \
+ ymm3 = _mm256_permute4x64_pd(ymm3, 0xb1); \
+ \
+ ymm4 = _mm256_blend_pd(ymm2, ymm0, 0x03); \
+ ymm5 = _mm256_blend_pd(ymm3, ymm1, 0x03); \
+ ymm6 = _mm256_blend_pd(ymm0, ymm2, 0x03); \
+ ymm7 = _mm256_blend_pd(ymm1, ymm3, 0x03); \
+ \
+ ymm4 += _mm256_loadu_pd(CO1 + (0 * ldc)); \
+ ymm5 += _mm256_loadu_pd(CO1 + (1 * ldc)); \
+ ymm6 += _mm256_loadu_pd(CO1 + (2 * ldc)); \
+ ymm7 += _mm256_loadu_pd(CO1 + (3 * ldc)); \
+ _mm256_storeu_pd(CO1 + (0 * ldc), ymm4); \
+ _mm256_storeu_pd(CO1 + (1 * ldc), ymm5); \
+ _mm256_storeu_pd(CO1 + (2 * ldc), ymm6); \
+ _mm256_storeu_pd(CO1 + (3 * ldc), ymm7); \
+ \
+ CO1 += 4;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT2x4() \
+ xmm4 = _mm_setzero_pd(); \
+ xmm5 = _mm_setzero_pd(); \
+ xmm6 = _mm_setzero_pd(); \
+ xmm7 = _mm_setzero_pd(); \
+
+
+
+#define KERNEL2x4_SUB() \
+ xmm0 = _mm_loadu_pd(AO - 16); \
+ xmm1 = _mm_set1_pd(*(BO - 12)); \
+ xmm2 = _mm_set1_pd(*(BO - 11)); \
+ xmm3 = _mm_set1_pd(*(BO - 10)); \
+ xmm4 += xmm0 * xmm1; \
+ xmm1 = _mm_set1_pd(*(BO - 9)); \
+ xmm5 += xmm0 * xmm2; \
+ xmm6 += xmm0 * xmm3; \
+ xmm7 += xmm0 * xmm1; \
+ BO += 4; \
+ AO += 2;
+
+
+
+#define SAVE2x4(ALPHA) \
+ xmm0 = _mm_set1_pd(ALPHA); \
+ xmm4 *= xmm0; \
+ xmm5 *= xmm0; \
+ xmm6 *= xmm0; \
+ xmm7 *= xmm0; \
+ \
+ xmm4 += _mm_loadu_pd(CO1 + (0 * ldc)); \
+ xmm5 += _mm_loadu_pd(CO1 + (1 * ldc)); \
+ xmm6 += _mm_loadu_pd(CO1 + (2 * ldc)); \
+ xmm7 += _mm_loadu_pd(CO1 + (3 * ldc)); \
+ \
+ _mm_storeu_pd(CO1 + (0 * ldc), xmm4); \
+ _mm_storeu_pd(CO1 + (1 * ldc), xmm5); \
+ _mm_storeu_pd(CO1 + (2 * ldc), xmm6); \
+ _mm_storeu_pd(CO1 + (3 * ldc), xmm7); \
+ \
+ CO1 += 2;
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT1x4() \
+ dbl4 = 0; \
+ dbl5 = 0; \
+ dbl6 = 0; \
+ dbl7 = 0; \
+
+#define KERNEL1x4_SUB() \
+ dbl0 = *(AO - 16); \
+ dbl1 = *(BO - 12); \
+ dbl2 = *(BO - 11); \
+ dbl3 = *(BO - 10); \
+ dbl8 = *(BO - 9); \
+ \
+ dbl4 += dbl0 * dbl1; \
+ dbl5 += dbl0 * dbl2; \
+ dbl6 += dbl0 * dbl3; \
+ dbl7 += dbl0 * dbl8; \
+ BO += 4; \
+ AO += 1;
+
+
+#define SAVE1x4(ALPHA) \
+ dbl0 = ALPHA; \
+ dbl4 *= dbl0; \
+ dbl5 *= dbl0; \
+ dbl6 *= dbl0; \
+ dbl7 *= dbl0; \
+ \
+ dbl4 += *(CO1 + (0 * ldc)); \
+ dbl5 += *(CO1 + (1 * ldc)); \
+ dbl6 += *(CO1 + (2 * ldc)); \
+ dbl7 += *(CO1 + (3 * ldc)); \
+ *(CO1 + (0 * ldc)) = dbl4; \
+ *(CO1 + (1 * ldc)) = dbl5; \
+ *(CO1 + (2 * ldc)) = dbl6; \
+ *(CO1 + (3 * ldc)) = dbl7; \
+ \
+ \
+ CO1 += 1;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT8x4() \
+ ymm10 = _mm256_setzero_pd(); \
+ ymm11 = _mm256_setzero_pd(); \
+ ymm12 = _mm256_setzero_pd(); \
+ ymm13 = _mm256_setzero_pd(); \
+ ymm14 = _mm256_setzero_pd(); \
+ ymm15 = _mm256_setzero_pd(); \
+ ymm16 = _mm256_setzero_pd(); \
+ ymm17 = _mm256_setzero_pd(); \
+
+
+#define KERNEL8x4_SUB() \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+ ymm1 = _mm256_loadu_pd(AO - 12); \
+ ymm2 = _mm256_set1_pd(*(BO - 12)); \
+ ymm3 = _mm256_set1_pd(*(BO - 11)); \
+ ymm4 = _mm256_set1_pd(*(BO - 10)); \
+ ymm5 = _mm256_set1_pd(*(BO - 9)); \
+ ymm10 += ymm0 * ymm2; \
+ ymm11 += ymm1 * ymm2; \
+ ymm12 += ymm0 * ymm3; \
+ ymm13 += ymm1 * ymm3; \
+ ymm14 += ymm0 * ymm4; \
+ ymm15 += ymm1 * ymm4; \
+ ymm16 += ymm0 * ymm5; \
+ ymm17 += ymm1 * ymm5; \
+ BO += 4; \
+ AO += 8;
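+
+/*
+ * Unlike KERNEL4x8_SUB, this kernel broadcasts each B element across a
+ * whole register, so the accumulators are already in row order and
+ * SAVE8x4 needs no permute/blend stage. (ymm10..ymm17 are C variable
+ * names, not fixed architectural registers; the compiler allocates them.)
+ */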
+
+
+
+#define SAVE8x4(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm10 *= ymm0; \
+ ymm11 *= ymm0; \
+ ymm12 *= ymm0; \
+ ymm13 *= ymm0; \
+ ymm14 *= ymm0; \
+ ymm15 *= ymm0; \
+ ymm16 *= ymm0; \
+ ymm17 *= ymm0; \
+ \
+ ymm10 += _mm256_loadu_pd(CO1); \
+ ymm11 += _mm256_loadu_pd(CO1 + 4); \
+ ymm12 += _mm256_loadu_pd(CO1 + (ldc)); \
+ ymm13 += _mm256_loadu_pd(CO1 + (ldc) + 4); \
+ ymm14 += _mm256_loadu_pd(CO1 + (ldc*2)); \
+ ymm15 += _mm256_loadu_pd(CO1 + (ldc*2) + 4); \
+ ymm16 += _mm256_loadu_pd(CO1 + (ldc*3)); \
+ ymm17 += _mm256_loadu_pd(CO1 + (ldc*3) + 4); \
+ \
+ _mm256_storeu_pd(CO1, ymm10); \
+ _mm256_storeu_pd(CO1 + 4, ymm11); \
+ _mm256_storeu_pd(CO1 + ldc, ymm12); \
+ _mm256_storeu_pd(CO1 + ldc + 4, ymm13); \
+ _mm256_storeu_pd(CO1 + ldc*2, ymm14); \
+ _mm256_storeu_pd(CO1 + ldc*2 + 4, ymm15); \
+ _mm256_storeu_pd(CO1 + ldc*3, ymm16); \
+ _mm256_storeu_pd(CO1 + ldc*3 + 4, ymm17); \
+ \
+ CO1 += 8;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+#define INIT8x2() \
+ ymm4 = _mm256_setzero_pd(); \
+ ymm5 = _mm256_setzero_pd(); \
+ ymm6 = _mm256_setzero_pd(); \
+ ymm7 = _mm256_setzero_pd(); \
+
+
+#define KERNEL8x2_SUB() \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+ ymm1 = _mm256_loadu_pd(AO - 12); \
+ ymm2 = _mm256_set1_pd(*(BO - 12)); \
+ ymm3 = _mm256_set1_pd(*(BO - 11)); \
+ ymm4 += ymm0 * ymm2; \
+ ymm5 += ymm1 * ymm2; \
+ ymm6 += ymm0 * ymm3; \
+ ymm7 += ymm1 * ymm3; \
+ BO += 2; \
+ AO += 8;
+
+
+
+#define SAVE8x2(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm4 *= ymm0; \
+ ymm5 *= ymm0; \
+ ymm6 *= ymm0; \
+ ymm7 *= ymm0; \
+ \
+ ymm4 += _mm256_loadu_pd(CO1); \
+ ymm5 += _mm256_loadu_pd(CO1 + 4); \
+ ymm6 += _mm256_loadu_pd(CO1 + (ldc)); \
+ ymm7 += _mm256_loadu_pd(CO1 + (ldc) + 4); \
+ \
+ _mm256_storeu_pd(CO1, ymm4); \
+ _mm256_storeu_pd(CO1 + 4, ymm5); \
+ _mm256_storeu_pd(CO1 + ldc, ymm6); \
+ _mm256_storeu_pd(CO1 + ldc + 4, ymm7); \
+ \
+ CO1 += 8;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+#define INIT4x2() \
+ xmm4 = _mm_setzero_pd(); \
+ xmm5 = _mm_setzero_pd(); \
+ xmm6 = _mm_setzero_pd(); \
+ xmm7 = _mm_setzero_pd(); \
+
+
+#define KERNEL4x2_SUB() \
+ xmm0 = _mm_loadu_pd(AO - 16); \
+ xmm1 = _mm_loadu_pd(AO - 14); \
+ xmm2 = _mm_set1_pd(*(BO - 12)); \
+ xmm3 = _mm_set1_pd(*(BO - 11)); \
+ xmm4 += xmm0 * xmm2; \
+ xmm5 += xmm1 * xmm2; \
+ xmm6 += xmm0 * xmm3; \
+ xmm7 += xmm1 * xmm3; \
+ BO += 2; \
+ AO += 4;
+
+
+
+#define SAVE4x2(ALPHA) \
+ xmm0 = _mm_set1_pd(ALPHA); \
+ xmm4 *= xmm0; \
+ xmm5 *= xmm0; \
+ xmm6 *= xmm0; \
+ xmm7 *= xmm0; \
+ \
+ xmm4 += _mm_loadu_pd(CO1); \
+ xmm5 += _mm_loadu_pd(CO1 + 2); \
+ xmm6 += _mm_loadu_pd(CO1 + (ldc)); \
+ xmm7 += _mm_loadu_pd(CO1 + (ldc) + 2); \
+ \
+ _mm_storeu_pd(CO1, xmm4); \
+ _mm_storeu_pd(CO1 + 2, xmm5); \
+ _mm_storeu_pd(CO1 + ldc, xmm6); \
+ _mm_storeu_pd(CO1 + ldc + 2, xmm7); \
+ \
+ CO1 += 4;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT2x2() \
+ xmm4 = _mm_setzero_pd(); \
+ xmm6 = _mm_setzero_pd(); \
+
+
+
+#define KERNEL2x2_SUB() \
+ xmm2 = _mm_set1_pd(*(BO - 12)); \
+ xmm0 = _mm_loadu_pd(AO - 16); \
+ xmm3 = _mm_set1_pd(*(BO - 11)); \
+ xmm4 += xmm0 * xmm2; \
+ xmm6 += xmm0 * xmm3; \
+ BO += 2; \
+ AO += 2;
+
+
+#define SAVE2x2(ALPHA) \
+ if (ALPHA != 1.0) { \
+ xmm0 = _mm_set1_pd(ALPHA); \
+ xmm4 *= xmm0; \
+ xmm6 *= xmm0; \
+ } \
+ \
+ xmm4 += _mm_loadu_pd(CO1); \
+ xmm6 += _mm_loadu_pd(CO1 + ldc); \
+ \
+ _mm_storeu_pd(CO1, xmm4); \
+ _mm_storeu_pd(CO1 + ldc, xmm6); \
+ \
+ CO1 += 2;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT1x2() \
+ dbl4 = 0; \
+ dbl5 = 0;
+
+
+#define KERNEL1x2_SUB() \
+ dbl0 = *(AO - 16); \
+ dbl1 = *(BO - 12); \
+ dbl2 = *(BO - 11); \
+ dbl4 += dbl0 * dbl1; \
+ dbl5 += dbl0 * dbl2; \
+ BO += 2; \
+ AO += 1;
+
+
+#define SAVE1x2(ALPHA) \
+ dbl0 = ALPHA; \
+ dbl4 *= dbl0; \
+ dbl5 *= dbl0; \
+ \
+ dbl4 += *(CO1 + (0 * ldc)); \
+ dbl5 += *(CO1 + (1 * ldc)); \
+ *(CO1 + (0 * ldc)) = dbl4; \
+ *(CO1 + (1 * ldc)) = dbl5; \
+ \
+ \
+ CO1 += 1;
+
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT4x1() \
+ ymm4 = _mm256_setzero_pd(); \
+ ymm5 = _mm256_setzero_pd(); \
+ ymm6 = _mm256_setzero_pd(); \
+ ymm7 = _mm256_setzero_pd();
+
+
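+/* 8x-unrolled variant of KERNEL4x1_SUB: one call consumes eight
+   k-iterations, spreading the products over four partial sums that
+   SAVE4x1 folds together. */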
+#define KERNEL4x1() \
+ ymm0 = _mm256_set1_pd(*(BO - 12)); \
+ ymm1 = _mm256_set1_pd(*(BO - 11)); \
+ ymm2 = _mm256_set1_pd(*(BO - 10)); \
+ ymm3 = _mm256_set1_pd(*(BO - 9)); \
+ \
+ ymm4 += _mm256_loadu_pd(AO - 16) * ymm0; \
+ ymm5 += _mm256_loadu_pd(AO - 12) * ymm1; \
+ \
+ ymm0 = _mm256_set1_pd(*(BO - 8)); \
+ ymm1 = _mm256_set1_pd(*(BO - 7)); \
+ \
+ ymm6 += _mm256_loadu_pd(AO - 8) * ymm2; \
+ ymm7 += _mm256_loadu_pd(AO - 4) * ymm3; \
+ \
+ ymm2 = _mm256_set1_pd(*(BO - 6)); \
+ ymm3 = _mm256_set1_pd(*(BO - 5)); \
+ \
+ ymm4 += _mm256_loadu_pd(AO + 0) * ymm0; \
+ ymm5 += _mm256_loadu_pd(AO + 4) * ymm1; \
+ ymm6 += _mm256_loadu_pd(AO + 8) * ymm2; \
+ ymm7 += _mm256_loadu_pd(AO + 12) * ymm3; \
+ \
+ BO += 8; \
+ AO += 32;
+
+
+#define INIT8x1() \
+ zmm4 = _mm512_setzero_pd(); \
+
+
+#define KERNEL8x1_SUB() \
+ zmm2 = _mm512_set1_pd(*(BO - 12)); \
+ zmm0 = _mm512_loadu_pd(AO - 16); \
+ zmm4 += zmm0 * zmm2; \
+ BO += 1; \
+ AO += 8;
+
+
+#define SAVE8x1(ALPHA) \
+ zmm0 = _mm512_set1_pd(ALPHA); \
+ zmm4 *= zmm0; \
+ \
+ zmm4 += _mm512_loadu_pd(CO1); \
+ _mm512_storeu_pd(CO1, zmm4); \
+ CO1 += 8;
+
+#define KERNEL4x1_SUB() \
+ ymm2 = _mm256_set1_pd(*(BO - 12)); \
+ ymm0 = _mm256_loadu_pd(AO - 16); \
+ ymm4 += ymm0 * ymm2; \
+ BO += 1; \
+ AO += 4;
+
+
+#define SAVE4x1(ALPHA) \
+ ymm0 = _mm256_set1_pd(ALPHA); \
+ ymm4 += ymm5; \
+ ymm6 += ymm7; \
+ ymm4 += ymm6; \
+ ymm4 *= ymm0; \
+ \
+ ymm4 += _mm256_loadu_pd(CO1); \
+ _mm256_storeu_pd(CO1, ymm4); \
+ CO1 += 4;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT2x1() \
+ xmm4 = _mm_setzero_pd();
+
+
+#define KERNEL2x1_SUB() \
+ xmm2 = _mm_set1_pd(*(BO - 12)); \
+ xmm0 = _mm_loadu_pd(AO - 16); \
+ xmm4 += xmm0 * xmm2; \
+ BO += 1; \
+ AO += 2;
+
+
+#define SAVE2x1(ALPHA) \
+ xmm0 = _mm_set1_pd(ALPHA); \
+ xmm4 *= xmm0; \
+ \
+ xmm4 += _mm_loadu_pd(CO1); \
+ \
+ _mm_storeu_pd(CO1, xmm4); \
+ \
+ CO1 += 2;
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+#define INIT1x1() \
+ dbl4 = 0;
+
+#define KERNEL1x1_SUB() \
+ dbl1 = *(BO - 12); \
+ dbl0 = *(AO - 16); \
+ dbl4 += dbl0 * dbl1; \
+ BO += 1; \
+ AO += 1;
+
+#define SAVE1x1(ALPHA) \
+ dbl0 = ALPHA; \
+ dbl4 *= dbl0; \
+ dbl4 += *CO1; \
+ *CO1 = dbl4; \
+ CO1 += 1;
+
+
+/*******************************************************************************************/
+
+/* START */
+
+
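+/*
+ * Driver: computes C += alpha * A * B for column-major C with leading
+ * dimension ldc, where A (m x k) and B (k x n) arrive pre-packed from
+ * the OpenBLAS level-3 framework. AO and BO are biased by 16 and 12
+ * doubles respectively so the kernels can address with small negative
+ * offsets.
+ */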
+int __attribute__ ((noinline))
+dgemm_kernel(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc)
+{
+ unsigned long M=m, N=n, K=k;
+
+
+ if (M == 0)
+ return 0;
+ if (N == 0)
+ return 0;
+ if (K == 0)
+ return 0;
+
+ while (N >= 8) {
+ double *CO1;
+ double *AO;
+ int i;
+
+ CO1 = C;
+ C += 8 * ldc;
+
+ AO = A + 16;
+
+ i = m;
+
+ while (i >= 8) {
+ double *BO;
+ int kloop = K;
+
+ BO = B + 12;
+		/*
+		 * This is the inner loop for the hot path.
+		 * It is written in inline asm because compilers like GCC 8 and
+		 * earlier struggle with register allocation and do not make good
+		 * use of the AVX512 embedded broadcast feature ({1to8}).
+		 */
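+		/*
+		 * Equivalent C sketch of the asm loop (hedged, for readability
+		 * only; the asm below is the tuned implementation):
+		 *
+		 *   __m512d acc[8];  // zeroed, one per B column
+		 *   for (; kloop > 0; kloop--, AO += 8, BO += 8)
+		 *       for (int j = 0; j < 8; j++)
+		 *           acc[j] = _mm512_fmadd_pd(_mm512_set1_pd(BO[j - 12]),
+		 *                                    _mm512_loadu_pd(AO - 16), acc[j]);
+		 *   // then each acc[j] is scaled by alpha and added into the
+		 *   // C column at CO1 + j * ldc.
+		 */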
+ asm(
+ "vxorpd %%zmm1, %%zmm1, %%zmm1\n"
+ "vmovapd %%zmm1, %%zmm2\n"
+ "vmovapd %%zmm1, %%zmm3\n"
+ "vmovapd %%zmm1, %%zmm4\n"
+ "vmovapd %%zmm1, %%zmm5\n"
+ "vmovapd %%zmm1, %%zmm6\n"
+ "vmovapd %%zmm1, %%zmm7\n"
+ "vmovapd %%zmm1, %%zmm8\n"
+ "vbroadcastsd (%[alpha]), %%zmm9\n"
+ "jmp .label1\n"
+ ".align 32\n"
+ /* Inner math loop */
+ ".label1:\n"
+ "vmovupd -128(%[AO]),%%zmm0\n"
+ "vfmadd231pd -96(%[BO])%{1to8%}, %%zmm0, %%zmm1\n"
+ "vfmadd231pd -88(%[BO])%{1to8%}, %%zmm0, %%zmm2\n"
+ "vfmadd231pd -80(%[BO])%{1to8%}, %%zmm0, %%zmm3\n"
+ "vfmadd231pd -72(%[BO])%{1to8%}, %%zmm0, %%zmm4\n"
+ "vfmadd231pd -64(%[BO])%{1to8%}, %%zmm0, %%zmm5\n"
+ "vfmadd231pd -56(%[BO])%{1to8%}, %%zmm0, %%zmm6\n"
+ "vfmadd231pd -48(%[BO])%{1to8%}, %%zmm0, %%zmm7\n"
+ "vfmadd231pd -40(%[BO])%{1to8%}, %%zmm0, %%zmm8\n"
+ "add $64, %[AO]\n"
+ "add $64, %[BO]\n"
+ "subl $1, %[kloop]\n"
+ "jg .label1\n"
+ /* multiply the result by alpha */
+ "vmulpd %%zmm9, %%zmm1, %%zmm1\n"
+ "vmulpd %%zmm9, %%zmm2, %%zmm2\n"
+ "vmulpd %%zmm9, %%zmm3, %%zmm3\n"
+ "vmulpd %%zmm9, %%zmm4, %%zmm4\n"
+ "vmulpd %%zmm9, %%zmm5, %%zmm5\n"
+ "vmulpd %%zmm9, %%zmm6, %%zmm6\n"
+ "vmulpd %%zmm9, %%zmm7, %%zmm7\n"
+ "vmulpd %%zmm9, %%zmm8, %%zmm8\n"
+ /* And store additively in C */
+ "vaddpd (%[C0]), %%zmm1, %%zmm1\n"
+ "vaddpd (%[C1]), %%zmm2, %%zmm2\n"
+ "vaddpd (%[C2]), %%zmm3, %%zmm3\n"
+ "vaddpd (%[C3]), %%zmm4, %%zmm4\n"
+ "vaddpd (%[C4]), %%zmm5, %%zmm5\n"
+ "vaddpd (%[C5]), %%zmm6, %%zmm6\n"
+ "vaddpd (%[C6]), %%zmm7, %%zmm7\n"
+ "vaddpd (%[C7]), %%zmm8, %%zmm8\n"
+ "vmovupd %%zmm1, (%[C0])\n"
+ "vmovupd %%zmm2, (%[C1])\n"
+ "vmovupd %%zmm3, (%[C2])\n"
+ "vmovupd %%zmm4, (%[C3])\n"
+ "vmovupd %%zmm5, (%[C4])\n"
+ "vmovupd %%zmm6, (%[C5])\n"
+ "vmovupd %%zmm7, (%[C6])\n"
+ "vmovupd %%zmm8, (%[C7])\n"
+ "prefetchw 64(%[C0])\n"
+ "prefetchw 64(%[C1])\n"
+ "prefetchw 64(%[C2])\n"
+ "prefetchw 64(%[C3])\n"
+ "prefetchw 64(%[C4])\n"
+ "prefetchw 64(%[C5])\n"
+ "prefetchw 64(%[C6])\n"
+ "prefetchw 64(%[C7])\n"
+ :
+ [AO] "+r" (AO),
+ [BO] "+r" (BO),
+ [C0] "+r" (CO1),
+ [kloop] "+r" (kloop)
+ :
+ [alpha] "r" (&alpha),
+ [C1] "r" (CO1 + 1 * ldc),
+ [C2] "r" (CO1 + 2 * ldc),
+ [C3] "r" (CO1 + 3 * ldc),
+ [C4] "r" (CO1 + 4 * ldc),
+ [C5] "r" (CO1 + 5 * ldc),
+ [C6] "r" (CO1 + 6 * ldc),
+ [C7] "r" (CO1 + 7 * ldc)
+
+ : "memory", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9"
+ );
+ CO1 += 8;
+			i -= 8;
+ }
+
+
+
+ while (i >= 4) {
+ double *BO;
+ __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11;
+ int kloop = K;
+
+ BO = B + 12;
+ INIT4x8()
+
+ while (kloop > 0) {
+ KERNEL4x8_SUB()
+ kloop--;
+ }
+ SAVE4x8(alpha)
+			i -= 4;
+ }
+
+
+ while (i >= 2) {
+ double *BO;
+ __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+ int kloop = K;
+
+ BO = B + 12;
+ INIT2x8()
+
+ while (kloop > 0) {
+ KERNEL2x8_SUB()
+ kloop--;
+ }
+ SAVE2x8(alpha)
+ i -= 2;
+ }
+
+ while (i >= 1) {
+ double *BO;
+ double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8, dbl9, dbl10, dbl11;
+ int kloop = K;
+
+ BO = B + 12;
+ INIT1x8()
+
+ while (kloop > 0) {
+ KERNEL1x8_SUB()
+ kloop--;
+ }
+ SAVE1x8(alpha)
+ i -= 1;
+ }
+ B += K * 8;
+ N -= 8;
+ }
+
+ if (N == 0)
+ return 0;
+
+
+
+ // L8_0
+ while (N >= 4) {
+ double *CO1;
+ double *AO;
+ int i;
+ // L8_10
+ CO1 = C;
+ C += 4 * ldc;
+
+ AO = A + 16;
+
+ i = m;
+ while (i >= 8) {
+ double *BO;
+ // L8_11
+			__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17;
+ BO = B + 12;
+ int kloop = K;
+
+ INIT8x4()
+
+ while (kloop > 0) {
+ // L12_17
+ KERNEL8x4_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE8x4(alpha)
+
+ i -= 8;
+ }
+ while (i >= 4) {
+ // L8_11
+ double *BO;
+ __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+ BO = B + 12;
+ int kloop = K;
+
+ INIT4x4()
+ // L8_16
+ while (kloop > 0) {
+ // L12_17
+ KERNEL4x4_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE4x4(alpha)
+
+ i -= 4;
+ }
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+ while (i >= 2) {
+ double *BO;
+ __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			BO = B + 12;
+
+ INIT2x4()
+ int kloop = K;
+
+ while (kloop > 0) {
+ KERNEL2x4_SUB()
+ kloop--;
+ }
+ SAVE2x4(alpha)
+ i -= 2;
+ }
+ // L13_40
+ while (i >= 1) {
+ double *BO;
+ double dbl0, dbl1, dbl2, dbl3, dbl4, dbl5, dbl6, dbl7, dbl8;
+ int kloop = K;
+ BO = B + 12;
+ INIT1x4()
+
+ while (kloop > 0) {
+ KERNEL1x4_SUB()
+ kloop--;
+ }
+ SAVE1x4(alpha)
+ i -= 1;
+ }
+
+ B += K * 4;
+ N -= 4;
+ }
+
+/**************************************************************************************************/
+
+ // L8_0
+ while (N >= 2) {
+ double *CO1;
+ double *AO;
+ int i;
+ // L8_10
+ CO1 = C;
+ C += 2 * ldc;
+
+ AO = A + 16;
+
+ i = m;
+ while (i >= 8) {
+ double *BO;
+ __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
+ // L8_11
+ BO = B + 12;
+ int kloop = K;
+
+ INIT8x2()
+
+ // L8_16
+ while (kloop > 0) {
+ // L12_17
+ KERNEL8x2_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE8x2(alpha)
+
+			i -= 8;
+ }
+
+ while (i >= 4) {
+ double *BO;
+ __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+ // L8_11
+ BO = B + 12;
+ int kloop = K;
+
+ INIT4x2()
+
+ // L8_16
+ while (kloop > 0) {
+ // L12_17
+ KERNEL4x2_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE4x2(alpha)
+
+			i -= 4;
+ }
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+ while (i >= 2) {
+ double *BO;
+ __m128d xmm0, xmm2, xmm3, xmm4, xmm6;
+ int kloop = K;
+ BO = B + 12;
+
+ INIT2x2()
+
+ while (kloop > 0) {
+ KERNEL2x2_SUB()
+ kloop--;
+ }
+ SAVE2x2(alpha)
+ i -= 2;
+ }
+ // L13_40
+ while (i >= 1) {
+ double *BO;
+ double dbl0, dbl1, dbl2, dbl4, dbl5;
+ int kloop = K;
+ BO = B + 12;
+
+ INIT1x2()
+
+ while (kloop > 0) {
+ KERNEL1x2_SUB()
+ kloop--;
+ }
+ SAVE1x2(alpha)
+ i -= 1;
+ }
+
+ B += K * 2;
+ N -= 2;
+ }
+
+ // L8_0
+ while (N >= 1) {
+ // L8_10
+ double *CO1;
+ double *AO;
+ int i;
+
+ CO1 = C;
+ C += ldc;
+
+ AO = A + 16;
+
+ i = m;
+ while (i >= 8) {
+ double *BO;
+ __m512d zmm0, zmm2, zmm4;
+ // L8_11
+ BO = B + 12;
+ int kloop = K;
+
+ INIT8x1()
+ // L8_16
+ while (kloop > 0) {
+ // L12_17
+ KERNEL8x1_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE8x1(alpha)
+
+			i -= 8;
+ }
+ while (i >= 4) {
+ double *BO;
+ __m256d ymm0, ymm2, ymm4, ymm5, ymm6, ymm7;
+ // L8_11
+ BO = B + 12;
+ int kloop = K;
+
+ INIT4x1()
+ // L8_16
+ while (kloop > 0) {
+ // L12_17
+ KERNEL4x1_SUB()
+ kloop--;
+ }
+ // L8_19
+ SAVE4x1(alpha)
+
+			i -= 4;
+ }
+
+/**************************************************************************
+* Rest of M
+***************************************************************************/
+
+ while (i >= 2) {
+ double *BO;
+ __m128d xmm0, xmm2, xmm4;
+ int kloop = K;
+			BO = B + 12;
+
+ INIT2x1()
+
+ while (kloop > 0) {
+ KERNEL2x1_SUB()
+ kloop--;
+ }
+ SAVE2x1(alpha)
+ i -= 2;
+ }
+ // L13_40
+ while (i >= 1) {
+ double *BO;
+ double dbl0, dbl1, dbl4;
+ int kloop = K;
+
+			BO = B + 12;
+ INIT1x1()
+
+
+ while (kloop > 0) {
+ KERNEL1x1_SUB()
+ kloop--;
+ }
+ SAVE1x1(alpha)
+ i -= 1;
+ }
+
+ B += K * 1;
+ N -= 1;
+ }
+
+
+ return 0;
+}