SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_power10.c
-DGEMMINCOPY = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY = dgemm_tcopy_16_power8.S
-DGEMMONCOPY = dgemm_ncopy_4_power8.S
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
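+# No separate A-side copies: with the square 8x8 unroll the 8-wide N-copy
+# routines below can pack both operands (an assumption based on how the
+# build handles equal M/N unroll factors).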
+DGEMMINCOPY =
+DGEMMITCOPY =
+DGEMMONCOPY = dgemm_ncopy_8_power10.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#endif
)
{
- BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
off = -offset;
#endif
v4sf_t valpha = { alpha, alpha };
- N = n >> 2;
- for (i1 = 0; i1 < N; i1++)
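+ /* Each outer iteration covers eight columns of C (N unroll = 8). */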
+ for (i1 = 0; i1 < (n >> 3); i1++)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
CO = C;
- C += ldc << 2;
+ C += ldc << 3;
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 3); j++)
{
- FLOAT *BO;
+ FLOAT *BO;
#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 4);
+ REFRESH_POINTERS (8, 8);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
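+ /* The 8x8 C tile lives in eight 4x2 MMA accumulators: each xvf64ger
+    multiplies a four-double __vector_pair of B (rowB or rowB1) by two
+    doubles of A (rowA[i]). */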
+ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
BLASLONG l = 0;
- PREFETCH1 (CO, 0);
- PREFETCH1 (CO + ldc, 0);
- PREFETCH1 (CO + ldc + ldc, 0);
- PREFETCH1 (CO + ldc + ldc + ldc, 0);
- PREFETCH1 (CO, 128);
- PREFETCH1 (CO + ldc, 128);
- PREFETCH1 (CO + ldc + ldc, 128);
- PREFETCH1 (CO + ldc + ldc + ldc, 128);
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
vec_t *rowA = (vec_t *) & AO[0];
- __vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
+ __vector_pair rowB, rowB1;
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+ __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
+ __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
+ __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
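+ /* The first k step primes the accumulators (xvf64ger); the loop below
+    folds in the remaining k-1 rank-1 updates (xvf64gerpp accumulates). */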
for (l = 1; l < temp; l++)
{
- rowA = (vec_t *) & AO[l << 4];
- rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 3];
+ rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
}
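+ /* SAVE_ACC stores a tile's columns 0-3 at the given row offset;
+    SAVE_ACC1 is assumed to store the matching columns 4-7 (both macros
+    are defined earlier in this file). */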
SAVE_ACC (&acc0, 0);
- SAVE_ACC (&acc2, 4);
- SAVE_ACC (&acc1, 2);
- SAVE_ACC (&acc3, 6);
- SAVE_ACC (&acc4, 8);
- SAVE_ACC (&acc6, 12);
- SAVE_ACC (&acc5, 10);
- SAVE_ACC (&acc7, 14);
- AO += temp << 4;
- BO += temp << 2;
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC (&acc2, 2);
+ SAVE_ACC1 (&acc3, 2);
+ SAVE_ACC (&acc4, 4);
+ SAVE_ACC1 (&acc5, 4);
+ SAVE_ACC (&acc6, 6);
+ SAVE_ACC1 (&acc7, 6);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 8)
+#endif
+ }
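+ /* m % 8 tails: the 4x8 and 2x8 blocks below repeat the same pattern
+    with fewer accumulators; the final single row falls back to plain
+    vector FMAs. */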
+ if (m & 4)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ BLASLONG l = 0;
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB, rowB1;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+ for (l = 1; l < temp; l++)
+ {
+ rowA = (vec_t *) & AO[l << 2];
+ rb = (vec_t *) & BO[l << 3];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC (&acc2, 2);
+ SAVE_ACC1 (&acc3, 2);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 8)
+#endif
+ }
+ if (m & 2)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ BLASLONG l = 0;
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB, rowB1;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ for (l = 1; l < temp; l++)
+ {
+ rowA = (vec_t *) & AO[l << 1];
+ rb = (vec_t *) & BO[l << 3];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC1 (&acc1, 0);
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 3;
#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 4)
+ REFRESH_AFTER_SAVE (2, 8)
#endif
- CO += 16;
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 1)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
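+ /* Despite its name, v4sf_t is a two-double vector here (note the
+    two-element initializers), so each t* accumulates two C columns. */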
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ v4sf_t t2 = { 0, 0 };
+ v4sf_t t3 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
+ v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
+ v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
+ v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
+ t += rowA * rowB;
+ t1 += rowA * rowB1;
+ t2 += rowA * rowB2;
+ t3 += rowA * rowB3;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+ t2 = t2 * valpha;
+ t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[2 * ldc] = t1[0];
+ CO[3 * ldc] = t1[1];
+ CO[4 * ldc] = t2[0];
+ CO[5 * ldc] = t2[1];
+ CO[6 * ldc] = t3[0];
+ CO[7 * ldc] = t3[1];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[2 * ldc] += t1[0];
+ CO[3 * ldc] += t1[1];
+ CO[4 * ldc] += t2[0];
+ CO[5 * ldc] += t2[1];
+ CO[6 * ldc] += t3[0];
+ CO[7 * ldc] += t3[1];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 8)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 8; // number of values in B
+#endif
+ B += k << 3;
+ }
+ if (n & 4)
+ {
+ BLASLONG j, temp;
+ FLOAT *CO;
+ FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ CO = C;
+ C += ldc << 2;
+ AO = A;
+ PREFETCH1 (A, 128);
+ PREFETCH1 (A, 256);
+ for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 4)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 4)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 4)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
B += k << 2;
}
- N = (n & 3) >> 1;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 2)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
CO = C;
C += ldc << 1;
AO = A;
- i = m >> 4;
- for (j = 0; j < i; j++)
- {
- FLOAT *BO;
-#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 2);
-#else
- BO = B;
- temp = k;
-#endif
- v4sf_t *rowC;
- v4sf_t result[4];
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- BLASLONG l = 0;
- FLOAT t[4] = { 0, 0, 0, 0 };
- t[0] = BO[0], t[1] = BO[1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[0];
- __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
- for (l = 1; l < temp; l++)
- {
- t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- rowA = (vec_t *) & AO[l << 4];
- __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
- }
- SAVE2x4_ACC (&acc0, 0);
- SAVE2x4_ACC (&acc1, 2);
- SAVE2x4_ACC (&acc2, 4);
- SAVE2x4_ACC (&acc3, 6);
- SAVE2x4_ACC (&acc4, 8);
- SAVE2x4_ACC (&acc5, 10);
- SAVE2x4_ACC (&acc6, 12);
- SAVE2x4_ACC (&acc7, 14);
- CO += 16;
- AO += temp << 4;
- BO += temp << 1;
-#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 2)
-#endif
- }
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 2)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 2)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
B += k << 1;
}
- N = (n & 1) >> 0;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
CO = C;
C += ldc;
AO = A;
- i = m;
- while (i >= 16)
- {
- FLOAT *BO;
-#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 1)
-#else
- BO = B;
- temp = k;
-#endif
- BLASLONG l = 0;
- v4sf_t t = { 0, 0 };
- v4sf_t t1 = { 0, 0 };
- v4sf_t t2 = { 0, 0 };
- v4sf_t t3 = { 0, 0 };
- v4sf_t t4 = { 0, 0 };
- v4sf_t t5 = { 0, 0 };
- v4sf_t t6 = { 0, 0 };
- v4sf_t t7 = { 0, 0 };
- for (l = 0; l < temp; l++)
- {
- v4sf_t rowB = { BO[l], BO[l] };
- v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
- v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
- v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
- v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
- v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
- v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
- v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
- v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
- t += rowA * rowB;
- t1 += rowA1 * rowB;
- t2 += rowA2 * rowB;
- t3 += rowA3 * rowB;
- t4 += rowA4 * rowB;
- t5 += rowA5 * rowB;
- t6 += rowA6 * rowB;
- t7 += rowA7 * rowB;
- }
- t = t * valpha;
- t1 = t1 * valpha;
- t2 = t2 * valpha;
- t3 = t3 * valpha;
- t4 = t4 * valpha;
- t5 = t5 * valpha;
- t6 = t6 * valpha;
- t7 = t7 * valpha;
-#if defined(TRMMKERNEL)
- CO[0] = t[0];
- CO[1] = t[1];
- CO[2] = t1[0];
- CO[3] = t1[1];
- CO[4] = t2[0];
- CO[5] = t2[1];
- CO[6] = t3[0];
- CO[7] = t3[1];
- CO[8] = t4[0];
- CO[9] = t4[1];
- CO[10] = t5[0];
- CO[11] = t5[1];
- CO[12] = t6[0];
- CO[13] = t6[1];
- CO[14] = t7[0];
- CO[15] = t7[1];
-#else
- CO[0] += t[0];
- CO[1] += t[1];
- CO[2] += t1[0];
- CO[3] += t1[1];
- CO[4] += t2[0];
- CO[5] += t2[1];
- CO[6] += t3[0];
- CO[7] += t3[1];
- CO[8] += t4[0];
- CO[9] += t4[1];
- CO[10] += t5[0];
- CO[11] += t5[1];
- CO[12] += t6[0];
- CO[13] += t6[1];
- CO[14] += t7[0];
- CO[15] += t7[1];
-#endif
- AO += temp << 4;
- BO += temp;
- CO += 16;
- i -= 16;
-#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 1)
-#endif
- }
- while (i >= 8)
+ for (i = 0; i < (m >> 3); i++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 3;
BO += temp;
CO += 8;
- i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
- while (i >= 4)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 2;
BO += temp;
CO += 4;
- i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
- while (i >= 2)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 1;
BO += temp;
CO += 2;
- i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
- while (i >= 1)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
CO[0] += t * alpha;
#endif
CO += 1;
- i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
--- /dev/null
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
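+/* Packs a block of a (column-major, leading dimension lda) for the 8x8
+   kernel: columns are taken eight at a time and interleaved so that each
+   group of eight consecutive output elements holds one element from each
+   of the eight columns; n%8 and m remainders fall back to narrower and
+   scalar copies. */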
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+ BLASLONG i, j;
+
+ IFLOAT *aoffset;
+ IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+ IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+ IFLOAT *boffset;
+ IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ IFLOAT ctemp09, ctemp17, ctemp33;
+ IFLOAT ctemp25, ctemp41;
+ IFLOAT ctemp49, ctemp57;
+
+ aoffset = a;
+ boffset = b;
+
+ j = (n >> 3);
+ if (j > 0){
+ do{
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset5 = aoffset4 + lda;
+ aoffset6 = aoffset5 + lda;
+ aoffset7 = aoffset6 + lda;
+ aoffset8 = aoffset7 + lda;
+ aoffset += 8 * lda;
+
+ i = (m >> 3);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384);
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ PREFETCHA (aoffset5, 384);
+ PREFETCHA (aoffset6, 384);
+ PREFETCHA (aoffset7, 384);
+ PREFETCHA (aoffset8, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset1 + 4);
+ __vector double va3 = *(__vector double*)(aoffset1 + 6);
+
+ __vector double va4 = *(__vector double*)(aoffset2 + 0);
+ __vector double va5 = *(__vector double*)(aoffset2 + 2);
+ __vector double va6 = *(__vector double*)(aoffset2 + 4);
+ __vector double va7 = *(__vector double*)(aoffset2 + 6);
+
+ __vector double va8 = *(__vector double*)(aoffset3 + 0);
+ __vector double va9 = *(__vector double*)(aoffset3 + 2);
+ __vector double va10 = *(__vector double*)(aoffset3 + 4);
+ __vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+ __vector double va12 = *(__vector double*)(aoffset4 + 0);
+ __vector double va13 = *(__vector double*)(aoffset4 + 2);
+ __vector double va14 = *(__vector double*)(aoffset4 + 4);
+ __vector double va15 = *(__vector double*)(aoffset4 + 6);
+
+ __vector double va16 = *(__vector double*)(aoffset5 + 0);
+ __vector double va17 = *(__vector double*)(aoffset5 + 2);
+ __vector double va18 = *(__vector double*)(aoffset5 + 4);
+ __vector double va19 = *(__vector double*)(aoffset5 + 6);
+
+ __vector double va20 = *(__vector double*)(aoffset6 + 0);
+ __vector double va21 = *(__vector double*)(aoffset6 + 2);
+ __vector double va22 = *(__vector double*)(aoffset6 + 4);
+ __vector double va23 = *(__vector double*)(aoffset6 + 6);
+
+ __vector double va24 = *(__vector double*)(aoffset7 + 0);
+ __vector double va25 = *(__vector double*)(aoffset7 + 2);
+ __vector double va26 = *(__vector double*)(aoffset7 + 4);
+ __vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+ __vector double va28 = *(__vector double*)(aoffset8 + 0);
+ __vector double va29 = *(__vector double*)(aoffset8 + 2);
+ __vector double va30 = *(__vector double*)(aoffset8 + 4);
+ __vector double va31 = *(__vector double*)(aoffset8 + 6);
+
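+ /* vec_xxpermdi with selector 0 effectively pairs the first doublewords
+    of its two operands and selector 3 the second, so the stores below
+    interleave matching elements of the eight columns. */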
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
+ *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
+
+ *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
+ *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
+ *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
+ *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
+ *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
+ *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
+ *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
+ *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
+
+ *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
+ *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
+ *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
+ *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
+ *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
+ *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
+ *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
+ *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
+
+ *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
+ *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
+ *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
+ *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
+ *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
+ *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
+ *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
+ *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
+ aoffset1 += 8;
+ aoffset2 += 8;
+ aoffset3 += 8;
+ aoffset4 += 8;
+ aoffset5 += 8;
+ aoffset6 += 8;
+ aoffset7 += 8;
+ aoffset8 += 8;
+ boffset += 64;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 7);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp09 = *(aoffset2 + 0);
+ ctemp17 = *(aoffset3 + 0);
+ ctemp25 = *(aoffset4 + 0);
+ ctemp33 = *(aoffset5 + 0);
+ ctemp41 = *(aoffset6 + 0);
+ ctemp49 = *(aoffset7 + 0);
+ ctemp57 = *(aoffset8 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp09;
+ *(boffset + 2) = ctemp17;
+ *(boffset + 3) = ctemp25;
+ *(boffset + 4) = ctemp33;
+ *(boffset + 5) = ctemp41;
+ *(boffset + 6) = ctemp49;
+ *(boffset + 7) = ctemp57;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ aoffset3 ++;
+ aoffset4 ++;
+ aoffset5 ++;
+ aoffset6 ++;
+ aoffset7 ++;
+ aoffset8 ++;
+
+ boffset += 8;
+ i --;
+ }while(i > 0);
+ }
+ j--;
+ }while(j > 0);
+ } /* end of if(j > 0) */
+
+ if (n & 4){
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset += 4 * lda;
+
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384);
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset2 + 0);
+ __vector double va3 = *(__vector double*)(aoffset2 + 2);
+ __vector double va4 = *(__vector double*)(aoffset3 + 0);
+ __vector double va5 = *(__vector double*)(aoffset3 + 2);
+ __vector double va6 = *(__vector double*)(aoffset4 + 0);
+ __vector double va7 = *(__vector double*)(aoffset4 + 2);
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
+ *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
+
+ aoffset1 += 4;
+ aoffset2 += 4;
+ aoffset3 += 4;
+ aoffset4 += 4;
+ boffset += 16;
+ i --;
+ }while(i > 0);
+ }
+
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset2 + 0);
+ ctemp03 = *(aoffset3 + 0);
+ ctemp04 = *(aoffset4 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+ *(boffset + 2) = ctemp03;
+ *(boffset + 3) = ctemp04;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ aoffset3 ++;
+ aoffset4 ++;
+
+ boffset += 4;
+ i --;
+ }while(i > 0);
+ }
+ } /* end of if(n & 4) */
+
+ if (n & 2){
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset += 2 * lda;
+
+ i = (m >> 1);
+ if (i > 0){
+ do{
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset2 + 0);
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
+
+ aoffset1 += 2;
+ aoffset2 += 2;
+ boffset += 4;
+ i --;
+ }while(i > 0);
+ }
+
+ if (m & 1){
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset2 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ boffset += 2;
+ }
+ } /* end of if(n & 2) */
+
+ if (n & 1){
+ aoffset1 = aoffset;
+
+ i = m;
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+
+ *(boffset + 0) = ctemp01;
+
+ aoffset1 ++;
+ boffset ++;
+ i --;
+ }while(i > 0);
+ }
+
+ } /* end of if(n & 1) */
+
+ return 0;
+}
#endif
)
{
- BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
#endif
v4sf_t valpha = { alpha, alpha, alpha, alpha };
- N = n >> 3;
- for (i1 = 0; i1 < N; i1++)
+ for (i1 = 0; i1 < (n >> 3); i1++)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 4); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
CO += 16;
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 8)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 8)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 8)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
B += k << 3;
}
- N = (n & 7) >> 2;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 4)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
REFRESH_AFTER_SAVE (16, 4)
#endif
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 4)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 4)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 4)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
B += k << 2;
}
- N = (n & 3) >> 1;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 2)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
REFRESH_AFTER_SAVE (16, 2)
#endif
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
v4sf_t *rowC;
REFRESH_AFTER_SAVE (8, 2)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
v4sf_t *rowC;
REFRESH_AFTER_SAVE (4, 2)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
REFRESH_AFTER_SAVE (2, 2)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
BLASLONG l = 0;
B += k << 1;
}
- N = (n & 1) >> 0;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
CO = C;
C += ldc;
AO = A;
- i = m;
- while (i >= 16)
+ for (i = 0; i < (m >> 4); i++)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 4;
BO += temp;
CO += 16;
- i -= 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 1)
#endif
}
- while (i >= 8)
+ if (m & 8)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 3;
BO += temp;
CO += 8;
- i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
- while (i >= 4)
+ if (m & 4)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 2;
BO += temp;
CO += 4;
- i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
- while (i >= 2)
+ if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 1;
BO += temp;
CO += 2;
- i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
- while (i >= 1)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
CO[0] += t * alpha;
#endif
CO += 1;
- i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
#define SBGEMM_DEFAULT_P 832
#define SBGEMM_DEFAULT_Q 1026
#define SBGEMM_DEFAULT_R 4096
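+/* The POWER10 MMA dgemm kernel works on 8x8 tiles. */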
+#undef DGEMM_DEFAULT_UNROLL_M
+#undef DGEMM_DEFAULT_UNROLL_N
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#if defined(SPARC) && defined(V7)