/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.

3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

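/* Transposed single-precision GEMV kernel for MIPS MSA:
   y := alpha * A**T * x + y, where A is column-major with leading
   dimension lda, and x / y may have arbitrary strides inc_x / inc_y. */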
#include "common.h"
#include "macros_msa.h"

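/* 8 columns x 8 rows: load eight consecutive elements from each of the
   eight column pointers pa0..pa7 and accumulate their products with the
   x vectors x0/x1 into the per-column accumulators tp0..tp7. */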
#define SGEMV_T_8x8()              \
{                                  \
    LD_SP2(pa0 + k, 4, t0, t1);    \
    LD_SP2(pa1 + k, 4, t2, t3);    \
    LD_SP2(pa2 + k, 4, t4, t5);    \
    LD_SP2(pa3 + k, 4, t6, t7);    \
    LD_SP2(pa4 + k, 4, t8, t9);    \
    LD_SP2(pa5 + k, 4, t10, t11);  \
    LD_SP2(pa6 + k, 4, t12, t13);  \
    LD_SP2(pa7 + k, 4, t14, t15);  \
                                   \
    tp0 += x0 * t0;                \
    tp0 += x1 * t1;                \
                                   \
    tp1 += x0 * t2;                \
    tp1 += x1 * t3;                \
                                   \
    tp2 += x0 * t4;                \
    tp2 += x1 * t5;                \
                                   \
    tp3 += x0 * t6;                \
    tp3 += x1 * t7;                \
                                   \
    tp4 += x0 * t8;                \
    tp4 += x1 * t9;                \
                                   \
    tp5 += x0 * t10;               \
    tp5 += x1 * t11;               \
                                   \
    tp6 += x0 * t12;               \
    tp6 += x1 * t13;               \
                                   \
    tp7 += x0 * t14;               \
    tp7 += x1 * t15;               \
}

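/* 8 columns x 4 rows: same as above with a single x vector. */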
#define SGEMV_T_8x4()      \
{                          \
    t0  = LD_SP(pa0 + k);  \
    t2  = LD_SP(pa1 + k);  \
    t4  = LD_SP(pa2 + k);  \
    t6  = LD_SP(pa3 + k);  \
    t8  = LD_SP(pa4 + k);  \
    t10 = LD_SP(pa5 + k);  \
    t12 = LD_SP(pa6 + k);  \
    t14 = LD_SP(pa7 + k);  \
                           \
    tp0 += x0 * t0;        \
    tp1 += x0 * t2;        \
    tp2 += x0 * t4;        \
    tp3 += x0 * t6;        \
    tp4 += x0 * t8;        \
    tp5 += x0 * t10;       \
    tp6 += x0 * t12;       \
    tp7 += x0 * t14;       \
}

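/* 4 columns x 8 rows block for the n & 4 remainder. */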
#define SGEMV_T_4x8()            \
{                                \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    LD_SP2(pa1 + k, 4, t2, t3);  \
    LD_SP2(pa2 + k, 4, t4, t5);  \
    LD_SP2(pa3 + k, 4, t6, t7);  \
                                 \
    tp0 += x0 * t0;              \
    tp0 += x1 * t1;              \
                                 \
    tp1 += x0 * t2;              \
    tp1 += x1 * t3;              \
                                 \
    tp2 += x0 * t4;              \
    tp2 += x1 * t5;              \
                                 \
    tp3 += x0 * t6;              \
    tp3 += x1 * t7;              \
}

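/* 4 columns x 4 rows block for the n & 4 remainder. */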
#define SGEMV_T_4x4()     \
{                         \
    t0 = LD_SP(pa0 + k);  \
    t2 = LD_SP(pa1 + k);  \
    t4 = LD_SP(pa2 + k);  \
    t6 = LD_SP(pa3 + k);  \
                          \
    tp0 += x0 * t0;       \
    tp1 += x0 * t2;       \
    tp2 += x0 * t4;       \
    tp3 += x0 * t6;       \
}

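/* 2 columns x 8 rows block for the n & 2 remainder. */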
#define SGEMV_T_2x8()            \
{                                \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    LD_SP2(pa1 + k, 4, t2, t3);  \
                                 \
    tp0 += x0 * t0;              \
    tp0 += x1 * t1;              \
                                 \
    tp1 += x0 * t2;              \
    tp1 += x1 * t3;              \
}

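/* 2 columns x 4 rows block for the n & 2 remainder. */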
#define SGEMV_T_2x4()     \
{                         \
    t0 = LD_SP(pa0 + k);  \
    t2 = LD_SP(pa1 + k);  \
                          \
    tp0 += x0 * t0;       \
    tp1 += x0 * t2;       \
}

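/* Gather loads for strided x (inc_x != 1): build x0/x1 one element at a
   time with __msa_insert_w. The (v4i32) tp0 argument of the first insert
   is only a dummy source register; every lane is overwritten before use.
   The int casts move the float bit patterns unchanged. */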
#define SLOAD_X8_GP()                                                        \
    x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  1, *((int *)(x + 1 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  2, *((int *)(x + 2 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  3, *((int *)(x + 3 * inc_x)));  \
    x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x)));  \
    x1 = (v4f32) __msa_insert_w((v4i32) x1,  1, *((int *)(x + 5 * inc_x)));  \
    x1 = (v4f32) __msa_insert_w((v4i32) x1,  2, *((int *)(x + 6 * inc_x)));  \
    x1 = (v4f32) __msa_insert_w((v4i32) x1,  3, *((int *)(x + 7 * inc_x)));

#define SLOAD_X4_GP()                                                        \
    x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  1, *((int *)(x + 1 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  2, *((int *)(x + 2 * inc_x)));  \
    x0 = (v4f32) __msa_insert_w((v4i32) x0,  3, *((int *)(x + 3 * inc_x)));

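/* Contiguous loads for unit-stride x (inc_x == 1). */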
#define SLOAD_X8_VECTOR()  LD_SP2(x, 4, x0, x1);
#define SLOAD_X4_VECTOR()  x0 = LD_SP(x);

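/* Main driver: walks the columns of A in blocks of 8, 4, 2 and 1, and
   the rows in blocks of 8 and 4 with a scalar remainder loop. Each
   column block keeps one v4f32 accumulator per column; the accumulators
   are reduced horizontally (via transpose/interleave), the m & 3 tail is
   added in scalar code, and y is updated with alpha times the dot
   products. */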
#define SGEMV_T_MSA()                            \
    for (j = (n >> 3); j--;)                     \
    {                                            \
        tp0 = zero;                              \
        tp1 = zero;                              \
        tp2 = zero;                              \
        tp3 = zero;                              \
        tp4 = zero;                              \
        tp5 = zero;                              \
        tp6 = zero;                              \
        tp7 = zero;                              \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 3); i--;)                 \
        {                                        \
            SLOAD_X8();                          \
            SGEMV_T_8x8();                       \
                                                 \
            x += 8 * inc_x;                      \
            k += 8;                              \
        }                                        \
                                                 \
        if (m & 4)                               \
        {                                        \
            SLOAD_X4();                          \
            SGEMV_T_8x4();                       \
                                                 \
            x += 4 * inc_x;                      \
            k += 4;                              \
        }                                        \
                                                 \
        /* horizontal reduction: after the       \
           transposes, lane l of tp0 (tp4)       \
           holds a partial sum for column l      \
           (l + 4), so vector adds complete      \
           the eight dot products */             \
        TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3,   \
                           tp0, tp1, tp2, tp3);  \
        TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7,   \
                           tp4, tp5, tp6, tp7);  \
        tp0 += tp1;                              \
        tp0 += tp2;                              \
        tp0 += tp3;                              \
        tp4 += tp5;                              \
        tp4 += tp6;                              \
        tp4 += tp7;                              \
                                                 \
        temp0 = tp0[0];                          \
        temp1 = tp0[1];                          \
        temp2 = tp0[2];                          \
        temp3 = tp0[3];                          \
        temp4 = tp4[0];                          \
        temp5 = tp4[1];                          \
        temp6 = tp4[2];                          \
        temp7 = tp4[3];                          \
                                                 \
        /* scalar tail for the last m & 3 rows */\
        for (i = (m & 3); i--;)                  \
        {                                        \
            temp0 += pa0[k] * x[0];              \
            temp1 += pa1[k] * x[0];              \
            temp2 += pa2[k] * x[0];              \
            temp3 += pa3[k] * x[0];              \
            temp4 += pa4[k] * x[0];              \
            temp5 += pa5[k] * x[0];              \
            temp6 += pa6[k] * x[0];              \
            temp7 += pa7[k] * x[0];              \
                                                 \
            x += inc_x;                          \
            k++;                                 \
        }                                        \
                                                 \
        res0 = y[0 * inc_y];                     \
        res1 = y[1 * inc_y];                     \
        res2 = y[2 * inc_y];                     \
        res3 = y[3 * inc_y];                     \
        res4 = y[4 * inc_y];                     \
        res5 = y[5 * inc_y];                     \
        res6 = y[6 * inc_y];                     \
        res7 = y[7 * inc_y];                     \
                                                 \
        res0 += alpha * temp0;                   \
        res1 += alpha * temp1;                   \
        res2 += alpha * temp2;                   \
        res3 += alpha * temp3;                   \
        res4 += alpha * temp4;                   \
        res5 += alpha * temp5;                   \
        res6 += alpha * temp6;                   \
        res7 += alpha * temp7;                   \
                                                 \
        y[0 * inc_y] = res0;                     \
        y[1 * inc_y] = res1;                     \
        y[2 * inc_y] = res2;                     \
        y[3 * inc_y] = res3;                     \
        y[4 * inc_y] = res4;                     \
        y[5 * inc_y] = res5;                     \
        y[6 * inc_y] = res6;                     \
        y[7 * inc_y] = res7;                     \
                                                 \
        y += 8 * inc_y;                          \
                                                 \
        pa0 += 8 * lda;                          \
        pa1 += 8 * lda;                          \
        pa2 += 8 * lda;                          \
        pa3 += 8 * lda;                          \
        pa4 += 8 * lda;                          \
        pa5 += 8 * lda;                          \
        pa6 += 8 * lda;                          \
        pa7 += 8 * lda;                          \
    }                                            \
                                                 \
    if (n & 4)                                   \
    {                                            \
        tp0 = zero;                              \
        tp1 = zero;                              \
        tp2 = zero;                              \
        tp3 = zero;                              \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 3); i--;)                 \
        {                                        \
            SLOAD_X8();                          \
            SGEMV_T_4x8();                       \
                                                 \
            x += 8 * inc_x;                      \
            k += 8;                              \
        }                                        \
                                                 \
        if (m & 4)                               \
        {                                        \
            SLOAD_X4();                          \
            SGEMV_T_4x4();                       \
                                                 \
            x += 4 * inc_x;                      \
            k += 4;                              \
        }                                        \
                                                 \
        TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3,   \
                           tp0, tp1, tp2, tp3);  \
        tp0 += tp1;                              \
        tp0 += tp2;                              \
        tp0 += tp3;                              \
                                                 \
        temp0 = tp0[0];                          \
        temp1 = tp0[1];                          \
        temp2 = tp0[2];                          \
        temp3 = tp0[3];                          \
                                                 \
        for (i = (m & 3); i--;)                  \
        {                                        \
            temp0 += pa0[k] * x[0];              \
            temp1 += pa1[k] * x[0];              \
            temp2 += pa2[k] * x[0];              \
            temp3 += pa3[k] * x[0];              \
                                                 \
            x += inc_x;                          \
            k++;                                 \
        }                                        \
                                                 \
        res0 = y[0 * inc_y];                     \
        res1 = y[1 * inc_y];                     \
        res2 = y[2 * inc_y];                     \
        res3 = y[3 * inc_y];                     \
                                                 \
        res0 += alpha * temp0;                   \
        res1 += alpha * temp1;                   \
        res2 += alpha * temp2;                   \
        res3 += alpha * temp3;                   \
                                                 \
        y[0 * inc_y] = res0;                     \
        y[1 * inc_y] = res1;                     \
        y[2 * inc_y] = res2;                     \
        y[3 * inc_y] = res3;                     \
                                                 \
        y += 4 * inc_y;                          \
                                                 \
        pa0 += 4 * lda;                          \
        pa1 += 4 * lda;                          \
        pa2 += 4 * lda;                          \
        pa3 += 4 * lda;                          \
    }                                            \
                                                 \
    if (n & 2)                                   \
    {                                            \
        tp0 = zero;                              \
        tp1 = zero;                              \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 3); i--;)                 \
        {                                        \
            SLOAD_X8();                          \
            SGEMV_T_2x8();                       \
                                                 \
            x += 8 * inc_x;                      \
            k += 8;                              \
        }                                        \
                                                 \
        if (m & 4)                               \
        {                                        \
            SLOAD_X4();                          \
            SGEMV_T_2x4();                       \
                                                 \
            x += 4 * inc_x;                      \
            k += 4;                              \
        }                                        \
                                                 \
        /* interleave tp0/tp1 so that even       \
           lanes belong to column 0 and odd      \
           lanes to column 1, then add */        \
        ILVRL_W2_SP(tp1, tp0, tp2, tp3);         \
                                                 \
        tp2 += tp3;                              \
                                                 \
        temp0 = tp2[0] + tp2[2];                 \
        temp1 = tp2[1] + tp2[3];                 \
                                                 \
        for (i = (m & 3); i--;)                  \
        {                                        \
            temp0 += pa0[k] * x[0];              \
            temp1 += pa1[k] * x[0];              \
                                                 \
            x += inc_x;                          \
            k++;                                 \
        }                                        \
                                                 \
        res0 = y[0 * inc_y];                     \
        res1 = y[1 * inc_y];                     \
                                                 \
        res0 += alpha * temp0;                   \
        res1 += alpha * temp1;                   \
                                                 \
        y[0 * inc_y] = res0;                     \
        y[1 * inc_y] = res1;                     \
                                                 \
        y += 2 * inc_y;                          \
                                                 \
        pa0 += 2 * lda;                          \
        pa1 += 2 * lda;                          \
    }                                            \
                                                 \
    if (n & 1)                                   \
    {                                            \
        temp0 = 0.0;                             \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = m; i--;)                        \
        {                                        \
            temp0 += pa0[k] * x[0];              \
                                                 \
            x += inc_x;                          \
            k++;                                 \
        }                                        \
                                                 \
        y[0] += alpha * temp0;                   \
        y += inc_y;                              \
        pa0 += lda;                              \
    }

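/* Kernel entry point (standard OpenBLAS gemv kernel signature; dummy1
   and buffer are unused here). inc_x selects between contiguous vector
   loads of x and element-wise gather loads. */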
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *srcx_org = x;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
    v4f32 x0, x1;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v4f32 zero = {0};

    /* pointers to the first eight columns of A */
    pa0 = A + 0 * lda;
    pa1 = A + 1 * lda;
    pa2 = A + 2 * lda;
    pa3 = A + 3 * lda;
    pa4 = A + 4 * lda;
    pa5 = A + 5 * lda;
    pa6 = A + 6 * lda;
    pa7 = A + 7 * lda;

    if (1 == inc_x)
    {
        /* unit stride: load x with vector loads */
        #define SLOAD_X8  SLOAD_X8_VECTOR
        #define SLOAD_X4  SLOAD_X4_VECTOR

        SGEMV_T_MSA();

        #undef SLOAD_X8
        #undef SLOAD_X4
    }
    else
    {
        /* non-unit stride: gather x element by element */
        #define SLOAD_X8  SLOAD_X8_GP
        #define SLOAD_X4  SLOAD_X4_GP

        SGEMV_T_MSA();

        #undef SLOAD_X8
        #undef SLOAD_X4
    }

    return(0);
}