/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
29 #include "macros_msa.h"
/* 8-row x 8-column step of dgemv-N: load 8 consecutive doubles (four
 * v2f64 vectors, element stride 2) from each of the eight column
 * pointers pa0..pa7 at row offset k into t0..t31.
 * NOTE(review): only the load stage is visible in this extract; the
 * multiply-accumulate half of this macro is elided here -- restore it
 * from the upstream OpenBLAS kernel before use. */
31 #define DGEMV_N_8x8() \
33 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
34 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
35 LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
36 LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
37 LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
38 LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
39 LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
40 LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
/* 4-row x 8-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the eight column pointers pa0..pa7 at offset k.
 * NOTE(review): the accumulate half of this macro is elided in this
 * extract -- restore from the upstream OpenBLAS kernel. */
83 #define DGEMV_N_4x8() \
85 LD_DP2(pa0 + k, 2, t0, t1); \
86 LD_DP2(pa1 + k, 2, t4, t5); \
87 LD_DP2(pa2 + k, 2, t8, t9); \
88 LD_DP2(pa3 + k, 2, t12, t13); \
89 LD_DP2(pa4 + k, 2, t16, t17); \
90 LD_DP2(pa5 + k, 2, t20, t21); \
91 LD_DP2(pa6 + k, 2, t24, t25); \
92 LD_DP2(pa7 + k, 2, t28, t29); \
/* 8-row x 4-column step: load 8 consecutive doubles (four v2f64
 * vectors) from each of the four column pointers pa0..pa3 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
119 #define DGEMV_N_8x4() \
121 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
122 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
123 LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
124 LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
/* 4-row x 4-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the four column pointers pa0..pa3 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
147 #define DGEMV_N_4x4() \
149 LD_DP2(pa0 + k, 2, t0, t1); \
150 LD_DP2(pa1 + k, 2, t4, t5); \
151 LD_DP2(pa2 + k, 2, t8, t9); \
152 LD_DP2(pa3 + k, 2, t12, t13); \
/* 8-row x 2-column step: load 8 consecutive doubles (four v2f64
 * vectors) from each of the two column pointers pa0..pa1 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
167 #define DGEMV_N_8x2() \
169 LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
170 LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
/* 4-row x 2-column step: load 4 consecutive doubles (two v2f64
 * vectors) from each of the two column pointers pa0..pa1 at offset k.
 * NOTE(review): accumulate half of the macro elided in this extract. */
183 #define DGEMV_N_4x2() \
185 LD_DP2(pa0 + k, 2, t0, t1); \
186 LD_DP2(pa1 + k, 2, t4, t5); \
/* Strided-x (general-purpose register) path: scale eight x elements by
 * alpha into temp0..temp7, then splat each scaled scalar across both
 * lanes of a v2f64 (tp0..tp7) for use as per-column multipliers. */
195 #define DLOAD_X8_SCALE_GP() \
196 temp0 = alpha * x[0 * inc_x]; \
197 temp1 = alpha * x[1 * inc_x]; \
198 temp2 = alpha * x[2 * inc_x]; \
199 temp3 = alpha * x[3 * inc_x]; \
200 temp4 = alpha * x[4 * inc_x]; \
201 temp5 = alpha * x[5 * inc_x]; \
202 temp6 = alpha * x[6 * inc_x]; \
203 temp7 = alpha * x[7 * inc_x]; \
205 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
206 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
207 tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
208 tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
209 tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
210 tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
211 tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
212 tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \
/* Strided-x path, 4-column variant: scale four x elements by alpha and
 * splat each into both lanes of a v2f64 (tp0..tp3). */
214 #define DLOAD_X4_SCALE_GP() \
215 temp0 = alpha * x[0 * inc_x]; \
216 temp1 = alpha * x[1 * inc_x]; \
217 temp2 = alpha * x[2 * inc_x]; \
218 temp3 = alpha * x[3 * inc_x]; \
220 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
221 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
222 tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
223 tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
/* Unit-stride x path: load eight contiguous doubles of x as four v2f64
 * vectors, then splat each lane into its own broadcast vector tp0..tp7.
 * NOTE(review): the alpha-scaling statements (x0..x3 *= v_alpha) are
 * elided in this extract -- confirm against the full macro body. */
225 #define DLOAD_X8_SCALE_VECTOR() \
226 LD_DP4(x, 2, x0, x1, x2, x3); \
233 SPLATI_D2_DP(x0, tp0, tp1); \
234 SPLATI_D2_DP(x1, tp2, tp3); \
235 SPLATI_D2_DP(x2, tp4, tp5); \
236 SPLATI_D2_DP(x3, tp6, tp7); \
/* Unit-stride x path, 4-column variant: load four contiguous doubles of
 * x and splat each lane into tp0..tp3.
 * NOTE(review): the alpha-scaling statements are elided in this
 * extract -- confirm against the full macro body. */
238 #define DLOAD_X4_SCALE_VECTOR() \
239 LD_DP2(x, 2, x0, x1); \
244 SPLATI_D2_DP(x0, tp0, tp1); \
245 SPLATI_D2_DP(x1, tp2, tp3); \
/* Gather eight strided y elements into y0..y3 (two doubles per vector)
 * by inserting each element's raw 64-bit pattern with __msa_insert_d.
 * tp0 only seeds the destination register; both lanes are overwritten. */
247 #define DLOAD_Y8_GP() \
248 y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
249 y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
250 y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
251 y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
252 y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \
253 y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \
254 y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \
255 y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \
/* Gather four strided y elements into y0..y1, two doubles per vector,
 * via raw 64-bit lane inserts (see DLOAD_Y8_GP). */
257 #define DLOAD_Y4_GP() \
258 y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \
259 y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \
260 y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \
261 y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \
/* Unit-stride y path: load 8 contiguous doubles of y into y0..y3. */
263 #define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3);
/* Unit-stride y path: load 4 contiguous doubles of y into y0..y1. */
264 #define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1);
/* Scatter y0..y3 back to strided y: copy each 64-bit lane out with
 * __msa_copy_s_d and store it as the raw bit pattern of the double. */
266 #define DSTORE_Y8_GP() \
267 *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
268 *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
269 *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
270 *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
271 *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \
272 *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \
273 *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \
274 *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \
/* Scatter y0..y1 back to strided y, one 64-bit lane per element
 * (see DSTORE_Y8_GP). */
276 #define DSTORE_Y4_GP() \
277 *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \
278 *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \
279 *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \
280 *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \
/* Unit-stride y path: store y0..y3 back as 8 contiguous doubles. */
282 #define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2);
/* Unit-stride y path: store y0..y1 back as 4 contiguous doubles. */
283 #define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2);
/* Driver for dgemv no-transpose: walks columns of A in groups of 8
 * (j = n >> 3), then narrower groups, and within each group walks rows
 * in vectorized blocks of 8 (i = m >> 3) with a scalar remainder loop
 * (i = m & 3) that accumulates temp += temp_c * pa_c[k] per column;
 * the final single-column tail does y[0] += alpha * x[0] * pa0[k].
 * NOTE(review): large parts of this macro body (braces, vector
 * compute/store statements, and the macro terminator) are elided in
 * this extract -- restore from the upstream OpenBLAS
 * kernel/mips/dgemv_n_msa.c before compiling. */
285 #define DGEMV_N_MSA() \
286 for (j = (n >> 3); j--;) \
293 for (i = (m >> 3); i--;) \
315 temp0 = alpha * x[0 * inc_x]; \
316 temp1 = alpha * x[1 * inc_x]; \
317 temp2 = alpha * x[2 * inc_x]; \
318 temp3 = alpha * x[3 * inc_x]; \
319 temp4 = alpha * x[4 * inc_x]; \
320 temp5 = alpha * x[5 * inc_x]; \
321 temp6 = alpha * x[6 * inc_x]; \
322 temp7 = alpha * x[7 * inc_x]; \
324 for (i = (m & 3); i--;) \
327 temp += temp0 * pa0[k]; \
328 temp += temp1 * pa1[k]; \
329 temp += temp2 * pa2[k]; \
330 temp += temp3 * pa3[k]; \
331 temp += temp4 * pa4[k]; \
332 temp += temp5 * pa5[k]; \
333 temp += temp6 * pa6[k]; \
334 temp += temp7 * pa7[k]; \
360 for (i = (m >> 3); i--;) \
382 temp0 = alpha * x[0 * inc_x]; \
383 temp1 = alpha * x[1 * inc_x]; \
384 temp2 = alpha * x[2 * inc_x]; \
385 temp3 = alpha * x[3 * inc_x]; \
387 for (i = (m & 3); i--;) \
390 temp += temp0 * pa0[k]; \
391 temp += temp1 * pa1[k]; \
392 temp += temp2 * pa2[k]; \
393 temp += temp3 * pa3[k]; \
411 temp0 = alpha * x[0 * inc_x]; \
412 temp1 = alpha * x[1 * inc_x]; \
414 tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
415 tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
420 for (i = (m >> 3); i--;) \
442 temp0 = alpha * x[0 * inc_x]; \
443 temp1 = alpha * x[1 * inc_x]; \
445 for (i = (m & 3); i--;) \
448 temp += temp0 * pa0[k]; \
449 temp += temp1 * pa1[k]; \
465 temp = alpha * x[0]; \
472 y[0] += temp * pa0[k]; \
478 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
479 BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
484 FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
485 FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
487 v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
488 v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
489 v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
490 v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
492 v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
503 if ((1 == inc_x) && (1 == inc_y))
505 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
506 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
507 #define DLOAD_Y8 DLOAD_Y8_VECTOR
508 #define DLOAD_Y4 DLOAD_Y4_VECTOR
509 #define DSTORE_Y8 DSTORE_Y8_VECTOR
510 #define DSTORE_Y4 DSTORE_Y4_VECTOR
514 #undef DLOAD_X8_SCALE
515 #undef DLOAD_X4_SCALE
523 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
524 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
525 #define DLOAD_Y8 DLOAD_Y8_VECTOR
526 #define DLOAD_Y4 DLOAD_Y4_VECTOR
527 #define DSTORE_Y8 DSTORE_Y8_VECTOR
528 #define DSTORE_Y4 DSTORE_Y4_VECTOR
532 #undef DLOAD_X8_SCALE
533 #undef DLOAD_X4_SCALE
541 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
542 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR
543 #define DLOAD_Y8 DLOAD_Y8_GP
544 #define DLOAD_Y4 DLOAD_Y4_GP
545 #define DSTORE_Y8 DSTORE_Y8_GP
546 #define DSTORE_Y4 DSTORE_Y4_GP
550 #undef DLOAD_X8_SCALE
551 #undef DLOAD_X4_SCALE
559 #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
560 #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP
561 #define DLOAD_Y8 DLOAD_Y8_GP
562 #define DLOAD_Y4 DLOAD_Y4_GP
563 #define DSTORE_Y8 DSTORE_Y8_GP
564 #define DSTORE_Y4 DSTORE_Y4_GP
568 #undef DLOAD_X8_SCALE
569 #undef DLOAD_X4_SCALE