/* fix build error */
/* [platform/upstream/openblas.git] / kernel / mips / sgemv_n_msa.c */
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

31 #define SGEMV_N_8x8()              \
32 {                                  \
33     LD_SP2(pa0 + k, 4, t0, t1);    \
34     LD_SP2(pa1 + k, 4, t2, t3);    \
35     LD_SP2(pa2 + k, 4, t4, t5);    \
36     LD_SP2(pa3 + k, 4, t6, t7);    \
37     LD_SP2(pa4 + k, 4, t8, t9);    \
38     LD_SP2(pa5 + k, 4, t10, t11);  \
39     LD_SP2(pa6 + k, 4, t12, t13);  \
40     LD_SP2(pa7 + k, 4, t14, t15);  \
41                                    \
42     y0 += tp0 * t0;                \
43     y1 += tp0 * t1;                \
44                                    \
45     y0 += tp1 * t2;                \
46     y1 += tp1 * t3;                \
47                                    \
48     y0 += tp2 * t4;                \
49     y1 += tp2 * t5;                \
50                                    \
51     y0 += tp3 * t6;                \
52     y1 += tp3 * t7;                \
53                                    \
54     y0 += tp4 * t8;                \
55     y1 += tp4 * t9;                \
56                                    \
57     y0 += tp5 * t10;               \
58     y1 += tp5 * t11;               \
59                                    \
60     y0 += tp6 * t12;               \
61     y1 += tp6 * t13;               \
62                                    \
63     y0 += tp7 * t14;               \
64     y1 += tp7 * t15;               \
65 }
66
67 #define SGEMV_N_4x8()      \
68 {                          \
69     t0  = LD_SP(pa0 + k);  \
70     t2  = LD_SP(pa1 + k);  \
71     t4  = LD_SP(pa2 + k);  \
72     t6  = LD_SP(pa3 + k);  \
73     t8  = LD_SP(pa4 + k);  \
74     t10 = LD_SP(pa5 + k);  \
75     t12 = LD_SP(pa6 + k);  \
76     t14 = LD_SP(pa7 + k);  \
77                            \
78     y0 += tp0 * t0;        \
79     y0 += tp1 * t2;        \
80     y0 += tp2 * t4;        \
81     y0 += tp3 * t6;        \
82     y0 += tp4 * t8;        \
83     y0 += tp5 * t10;       \
84     y0 += tp6 * t12;       \
85     y0 += tp7 * t14;       \
86 }
87
88 #define SGEMV_N_8x4()            \
89 {                                \
90     LD_SP2(pa0 + k, 4, t0, t1);  \
91     LD_SP2(pa1 + k, 4, t2, t3);  \
92     LD_SP2(pa2 + k, 4, t4, t5);  \
93     LD_SP2(pa3 + k, 4, t6, t7);  \
94                                  \
95     y0 += tp0 * t0;              \
96     y1 += tp0 * t1;              \
97                                  \
98     y0 += tp1 * t2;              \
99     y1 += tp1 * t3;              \
100                                  \
101     y0 += tp2 * t4;              \
102     y1 += tp2 * t5;              \
103                                  \
104     y0 += tp3 * t6;              \
105     y1 += tp3 * t7;              \
106 }
107
108 #define SGEMV_N_4x4()      \
109 {                          \
110     t0  = LD_SP(pa0 + k);  \
111     t2  = LD_SP(pa1 + k);  \
112     t4  = LD_SP(pa2 + k);  \
113     t6  = LD_SP(pa3 + k);  \
114                            \
115     y0 += tp0 * t0;        \
116     y0 += tp1 * t2;        \
117     y0 += tp2 * t4;        \
118     y0 += tp3 * t6;        \
119 }
120
121 #define SGEMV_N_8x2()            \
122 {                                \
123     LD_SP2(pa0 + k, 4, t0, t1);  \
124     LD_SP2(pa1 + k, 4, t2, t3);  \
125                                  \
126     y0 += tp0 * t0;              \
127     y1 += tp0 * t1;              \
128                                  \
129     y0 += tp1 * t2;              \
130     y1 += tp1 * t3;              \
131 }
132
133 #define SGEMV_N_4x2()      \
134 {                          \
135     t0  = LD_SP(pa0 + k);  \
136     t2  = LD_SP(pa1 + k);  \
137                            \
138     y0 += tp0 * t0;        \
139     y0 += tp1 * t2;        \
140 }
141
142 #define SLOAD_X8_SCALE_GP()             \
143     temp0 = alpha * x[0 * inc_x];       \
144     temp1 = alpha * x[1 * inc_x];       \
145     temp2 = alpha * x[2 * inc_x];       \
146     temp3 = alpha * x[3 * inc_x];       \
147     temp4 = alpha * x[4 * inc_x];       \
148     temp5 = alpha * x[5 * inc_x];       \
149     temp6 = alpha * x[6 * inc_x];       \
150     temp7 = alpha * x[7 * inc_x];       \
151                                         \
152     tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
153     tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
154     tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
155     tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
156     tp4 = COPY_FLOAT_TO_VECTOR(temp4);  \
157     tp5 = COPY_FLOAT_TO_VECTOR(temp5);  \
158     tp6 = COPY_FLOAT_TO_VECTOR(temp6);  \
159     tp7 = COPY_FLOAT_TO_VECTOR(temp7);  \
160
161 #define SLOAD_X4_SCALE_GP()             \
162     temp0 = alpha * x[0 * inc_x];       \
163     temp1 = alpha * x[1 * inc_x];       \
164     temp2 = alpha * x[2 * inc_x];       \
165     temp3 = alpha * x[3 * inc_x];       \
166                                         \
167     tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
168     tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
169     tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
170     tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
171
172 #define SLOAD_X8_SCALE_VECTOR()            \
173     LD_SP2(x, 4, x0, x1);                  \
174                                            \
175     x0 = x0 * v_alpha;                     \
176     x1 = x1 * v_alpha;                     \
177                                            \
178     SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
179     SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7);  \
180
181 #define SLOAD_X4_SCALE_VECTOR()            \
182     x0 = LD_SP(x);                         \
183     x0 = x0 * v_alpha;                     \
184     SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
185
186 #define SLOAD_Y8_GP()                                                        \
187     y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
188     y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
189     y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
190     y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
191     y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y)));  \
192     y1 = (v4f32) __msa_insert_w((v4i32) y1,  1, *((int *)(y + 5 * inc_y)));  \
193     y1 = (v4f32) __msa_insert_w((v4i32) y1,  2, *((int *)(y + 6 * inc_y)));  \
194     y1 = (v4f32) __msa_insert_w((v4i32) y1,  3, *((int *)(y + 7 * inc_y)));  \
195
196 #define SLOAD_Y4_GP()                                                        \
197     y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
198     y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
199     y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
200     y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
201
202 #define SLOAD_Y8_VECTOR()  LD_SP2(y, 4, y0, y1);
203 #define SLOAD_Y4_VECTOR()  y0 = LD_SP(y);
204
205 #define SSTORE_Y8_GP()                                          \
206     *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
207     *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
208     *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
209     *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
210     *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0);  \
211     *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1);  \
212     *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2);  \
213     *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3);  \
214
215 #define SSTORE_Y4_GP()                                          \
216     *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
217     *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
218     *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
219     *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
220
221 #define SSTORE_Y8_VECTOR()  ST_SP2(y0, y1, y, 4);
222 #define SSTORE_Y4_VECTOR()  ST_SP(y0, y);
223
224 #define SGEMV_N_MSA()                       \
225     for (j = (n >> 3); j--;)                \
226     {                                       \
227         SLOAD_X8_SCALE();                   \
228                                             \
229         k = 0;                              \
230         y = y_org;                          \
231                                             \
232         for (i = (m >> 3); i--;)            \
233         {                                   \
234             SLOAD_Y8();                     \
235             SGEMV_N_8x8();                  \
236             SSTORE_Y8();                    \
237                                             \
238             y += 8 * inc_y;                 \
239             k += 8;                         \
240         }                                   \
241                                             \
242         if (m & 4)                          \
243         {                                   \
244             SLOAD_Y4();                     \
245             SGEMV_N_4x8();                  \
246             SSTORE_Y4();                    \
247                                             \
248             y += 4 * inc_y;                 \
249             k += 4;                         \
250         }                                   \
251                                             \
252         if (m & 3)                          \
253         {                                   \
254             temp0 = alpha * x[0 * inc_x];   \
255             temp1 = alpha * x[1 * inc_x];   \
256             temp2 = alpha * x[2 * inc_x];   \
257             temp3 = alpha * x[3 * inc_x];   \
258             temp4 = alpha * x[4 * inc_x];   \
259             temp5 = alpha * x[5 * inc_x];   \
260             temp6 = alpha * x[6 * inc_x];   \
261             temp7 = alpha * x[7 * inc_x];   \
262                                             \
263             for (i = (m & 3); i--;)         \
264             {                               \
265                 temp = y[0];                \
266                 temp += temp0 * pa0[k];     \
267                 temp += temp1 * pa1[k];     \
268                 temp += temp2 * pa2[k];     \
269                 temp += temp3 * pa3[k];     \
270                 temp += temp4 * pa4[k];     \
271                 temp += temp5 * pa5[k];     \
272                 temp += temp6 * pa6[k];     \
273                 temp += temp7 * pa7[k];     \
274                 y[0] = temp;                \
275                                             \
276                 y += inc_y;                 \
277                 k++;                        \
278             }                               \
279         }                                   \
280         pa0 += 8 * lda;                     \
281         pa1 += 8 * lda;                     \
282         pa2 += 8 * lda;                     \
283         pa3 += 8 * lda;                     \
284         pa4 += 8 * lda;                     \
285         pa5 += 8 * lda;                     \
286         pa6 += 8 * lda;                     \
287         pa7 += 8 * lda;                     \
288                                             \
289         x += 8 * inc_x;                     \
290     }                                       \
291                                             \
292     if (n & 4)                              \
293     {                                       \
294         SLOAD_X4_SCALE();                   \
295                                             \
296         k = 0;                              \
297         y = y_org;                          \
298                                             \
299         for (i = (m >> 3); i--;)            \
300         {                                   \
301             SLOAD_Y8();                     \
302             SGEMV_N_8x4();                  \
303             SSTORE_Y8();                    \
304                                             \
305             y += 8 * inc_y;                 \
306             k += 8;                         \
307         }                                   \
308                                             \
309         if (m & 4)                          \
310         {                                   \
311             SLOAD_Y4();                     \
312             SGEMV_N_4x4();                  \
313             SSTORE_Y4();                    \
314                                             \
315             y += 4 * inc_y;                 \
316             k += 4;                         \
317         }                                   \
318                                             \
319         if (m & 3)                          \
320         {                                   \
321             temp0 = alpha * x[0 * inc_x];   \
322             temp1 = alpha * x[1 * inc_x];   \
323             temp2 = alpha * x[2 * inc_x];   \
324             temp3 = alpha * x[3 * inc_x];   \
325                                             \
326             for (i = (m & 3); i--;)         \
327             {                               \
328                 temp = y[0];                \
329                 temp += temp0 * pa0[k];     \
330                 temp += temp1 * pa1[k];     \
331                 temp += temp2 * pa2[k];     \
332                 temp += temp3 * pa3[k];     \
333                 y[0] = temp;                \
334                                             \
335                 y += inc_y;                 \
336                 k++;                        \
337             }                               \
338         }                                   \
339                                             \
340         pa0 += 4 * lda;                     \
341         pa1 += 4 * lda;                     \
342         pa2 += 4 * lda;                     \
343         pa3 += 4 * lda;                     \
344                                             \
345         x += 4 * inc_x;                     \
346     }                                       \
347                                             \
348     if (n & 2)                              \
349     {                                       \
350         temp0 = alpha * x[0 * inc_x];       \
351         temp1 = alpha * x[1 * inc_x];       \
352                                             \
353         tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
354         tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
355                                             \
356         k = 0;                              \
357         y = y_org;                          \
358                                             \
359         for (i = (m >> 3); i--;)            \
360         {                                   \
361             SLOAD_Y8();                     \
362             SGEMV_N_8x2();                  \
363             SSTORE_Y8();                    \
364                                             \
365             y += 8 * inc_y;                 \
366             k += 8;                         \
367         }                                   \
368                                             \
369         if (m & 4)                          \
370         {                                   \
371             SLOAD_Y4();                     \
372             SGEMV_N_4x2();                  \
373             SSTORE_Y4();                    \
374                                             \
375             y += 4 * inc_y;                 \
376             k += 4;                         \
377         }                                   \
378                                             \
379         if (m & 3)                          \
380         {                                   \
381             temp0 = alpha * x[0 * inc_x];   \
382             temp1 = alpha * x[1 * inc_x];   \
383                                             \
384             for (i = (m & 3); i--;)         \
385             {                               \
386                 temp = y[0];                \
387                 temp += temp0 * pa0[k];     \
388                 temp += temp1 * pa1[k];     \
389                 y[0] = temp;                \
390                                             \
391                 y += inc_y;                 \
392                 k++;                        \
393             }                               \
394         }                                   \
395                                             \
396         pa0 += 2 * lda;                     \
397         pa1 += 2 * lda;                     \
398                                             \
399         x += 2 * inc_x;                     \
400     }                                       \
401                                             \
402     if (n & 1)                              \
403     {                                       \
404         temp = alpha * x[0];                \
405                                             \
406         k = 0;                              \
407         y = y_org;                          \
408                                             \
409         for (i = m; i--;)                   \
410         {                                   \
411            y[0] += temp * pa0[k];           \
412                                             \
413            y += inc_y;                      \
414            k++;                             \
415         }                                   \
416     }                                       \
417
418 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
419           BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
420           FLOAT *buffer)
421 {
422     BLASLONG i, j, k;
423     FLOAT *y_org = y;
424     FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
425     FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
426     v4f32 v_alpha, x0, x1, y0, y1;
427     v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
428     v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
429
430     v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
431
432     pa0 = A;
433     pa1 = A + lda;
434     pa2 = A + 2 * lda;
435     pa3 = A + 3 * lda;
436     pa4 = A + 4 * lda;
437     pa5 = A + 5 * lda;
438     pa6 = A + 6 * lda;
439     pa7 = A + 7 * lda;
440
441     if ((1 == inc_x) && (1 == inc_y))
442     {
443         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
444         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
445         #define SLOAD_Y8   SLOAD_Y8_VECTOR
446         #define SLOAD_Y4   SLOAD_Y4_VECTOR
447         #define SSTORE_Y8  SSTORE_Y8_VECTOR
448         #define SSTORE_Y4  SSTORE_Y4_VECTOR
449
450         SGEMV_N_MSA();
451
452         #undef SLOAD_X8_SCALE
453         #undef SLOAD_X4_SCALE
454         #undef SLOAD_Y8
455         #undef SLOAD_Y4
456         #undef SSTORE_Y8
457         #undef SSTORE_Y4
458     }
459     else if (1 == inc_y)
460     {
461         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
462         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
463         #define SLOAD_Y8   SLOAD_Y8_VECTOR
464         #define SLOAD_Y4   SLOAD_Y4_VECTOR
465         #define SSTORE_Y8  SSTORE_Y8_VECTOR
466         #define SSTORE_Y4  SSTORE_Y4_VECTOR
467
468         SGEMV_N_MSA();
469
470         #undef SLOAD_X8_SCALE
471         #undef SLOAD_X4_SCALE
472         #undef SLOAD_Y8
473         #undef SLOAD_Y4
474         #undef SSTORE_Y8
475         #undef SSTORE_Y4
476     }
477     else if (1 == inc_x)
478     {
479         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
480         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
481         #define SLOAD_Y8   SLOAD_Y8_GP
482         #define SLOAD_Y4   SLOAD_Y4_GP
483         #define SSTORE_Y8  SSTORE_Y8_GP
484         #define SSTORE_Y4  SSTORE_Y4_GP
485
486         SGEMV_N_MSA();
487
488         #undef SLOAD_X8_SCALE
489         #undef SLOAD_X4_SCALE
490         #undef SLOAD_Y8
491         #undef SLOAD_Y4
492         #undef SSTORE_Y8
493         #undef SSTORE_Y4
494     }
495     else
496     {
497         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
498         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
499         #define SLOAD_Y8   SLOAD_Y8_GP
500         #define SLOAD_Y4   SLOAD_Y4_GP
501         #define SSTORE_Y8  SSTORE_Y8_GP
502         #define SSTORE_Y4  SSTORE_Y4_GP
503
504         SGEMV_N_MSA();
505
506         #undef SLOAD_X8_SCALE
507         #undef SLOAD_X4_SCALE
508         #undef SLOAD_Y8
509         #undef SLOAD_Y4
510         #undef SSTORE_Y8
511         #undef SSTORE_Y4
512     }
513
514     return(0);
515 }