/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"
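
/* Single-precision complex GEMV kernel for MIPS MSA, non-transposed:
 * accumulates y += alpha * A * x for column-major A (beta scaling of y,
 * if any, is applied by the caller).  The OP0..OP4 macros expand to
 * += or -= so that one code body serves all four CONJ/XCONJ conjugation
 * variants: OP3/OP4 select the signs used when scaling x by alpha, and
 * OP0..OP2 the signs used when accumulating column products into y. */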

#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

#if !defined(XCONJ)
    #define OP3  -=
    #define OP4  +=
#else
    #define OP3  +=
    #define OP4  -=
#endif

#if !defined(CONJ)
    #if !defined(XCONJ)
        #define OP0  -=
        #define OP1  +=
        #define OP2  +=
    #else
        #define OP0  +=
        #define OP1  +=
        #define OP2  -=
    #endif
#else
    #if !defined(XCONJ)
        #define OP0  +=
        #define OP1  -=
        #define OP2  -=
    #else
        #define OP0  -=
        #define OP1  -=
        #define OP2  +=
    #endif
#endif

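/* Compute macros, named CGEMV_N_<rows>x<cols>: each accumulates <cols>
 * alpha-scaled columns of A into <rows> elements of y.  The vector
 * variants load interleaved complex data into the t registers, then
 * PCKEVOD_W2_SP splits each pair of vectors into a real plane (even
 * words) and an imaginary plane (odd words) before the multiply-add
 * steps; the 1xN variants handle the scalar row remainder. */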
#define CGEMV_N_8x4()                        \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);      \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);      \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);    \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
                                             \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);     \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);     \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);     \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);     \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);     \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);   \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);   \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);   \
                                             \
    y0r += tp0r * src0r;                     \
    y1r += tp0r * src1r;                     \
    y0r += tp1r * src2r;                     \
    y1r += tp1r * src3r;                     \
    y0r += tp2r * src4r;                     \
    y1r += tp2r * src5r;                     \
    y0r += tp3r * src6r;                     \
    y1r += tp3r * src7r;                     \
                                             \
    y0r OP0 tp0i * src0i;                    \
    y1r OP0 tp0i * src1i;                    \
    y0r OP0 tp1i * src2i;                    \
    y1r OP0 tp1i * src3i;                    \
    y0r OP0 tp2i * src4i;                    \
    y1r OP0 tp2i * src5i;                    \
    y0r OP0 tp3i * src6i;                    \
    y1r OP0 tp3i * src7i;                    \
                                             \
    y0i OP1 tp0r * src0i;                    \
    y1i OP1 tp0r * src1i;                    \
    y0i OP1 tp1r * src2i;                    \
    y1i OP1 tp1r * src3i;                    \
    y0i OP1 tp2r * src4i;                    \
    y1i OP1 tp2r * src5i;                    \
    y0i OP1 tp3r * src6i;                    \
    y1i OP1 tp3r * src7i;                    \
                                             \
    y0i OP2 tp0i * src0r;                    \
    y1i OP2 tp0i * src1r;                    \
    y0i OP2 tp1i * src2r;                    \
    y1i OP2 tp1i * src3r;                    \
    y0i OP2 tp2i * src4r;                    \
    y1i OP2 tp2i * src5r;                    \
    y0i OP2 tp3i * src6r;                    \
    y1i OP2 tp3i * src7r;

#define CGEMV_N_4x4()                       \
    LD_SP2(pa0 + k, 4, t0, t1);             \
    LD_SP2(pa1 + k, 4, t4, t5);             \
    LD_SP2(pa2 + k, 4, t8, t9);             \
    LD_SP2(pa3 + k, 4, t12, t13);           \
                                            \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);    \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);    \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);    \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
                                            \
    y0r += tp0r * src0r;                    \
    y0r += tp1r * src2r;                    \
    y0r += tp2r * src4r;                    \
    y0r += tp3r * src6r;                    \
                                            \
    y0r OP0 tp0i * src0i;                   \
    y0r OP0 tp1i * src2i;                   \
    y0r OP0 tp2i * src4i;                   \
    y0r OP0 tp3i * src6i;                   \
                                            \
    y0i OP1 tp0r * src0i;                   \
    y0i OP1 tp1r * src2i;                   \
    y0i OP1 tp2r * src4i;                   \
    y0i OP1 tp3r * src6i;                   \
                                            \
    y0i OP2 tp0i * src0r;                   \
    y0i OP2 tp1i * src2r;                   \
    y0i OP2 tp2i * src4r;                   \
    y0i OP2 tp3i * src6r;

#define CGEMV_N_1x4()               \
    res0 = y[0 * inc_y2];           \
    res1 = y[0 * inc_y2 + 1];       \
                                    \
    res0  += temp0_r * pa0[k];      \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res0  += temp1_r * pa1[k];      \
    res0 OP0 temp1_i * pa1[k + 1];  \
    res0  += temp2_r * pa2[k];      \
    res0 OP0 temp2_i * pa2[k + 1];  \
    res0  += temp3_r * pa3[k];      \
    res0 OP0 temp3_i * pa3[k + 1];  \
                                    \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];      \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];      \
    res1 OP1 temp2_r * pa2[k + 1];  \
    res1 OP2 temp2_i * pa2[k];      \
    res1 OP1 temp3_r * pa3[k + 1];  \
    res1 OP2 temp3_i * pa3[k];      \
                                    \
    y[0 * inc_y2]     = res0;       \
    y[0 * inc_y2 + 1] = res1;

#define CGEMV_N_8x2()                     \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);   \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);   \
                                          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
                                          \
    y0r += tp0r * src0r;                  \
    y1r += tp0r * src1r;                  \
    y0r += tp1r * src2r;                  \
    y1r += tp1r * src3r;                  \
                                          \
    y0r OP0 tp0i * src0i;                 \
    y1r OP0 tp0i * src1i;                 \
    y0r OP0 tp1i * src2i;                 \
    y1r OP0 tp1i * src3i;                 \
                                          \
    y0i OP1 tp0r * src0i;                 \
    y1i OP1 tp0r * src1i;                 \
    y0i OP1 tp1r * src2i;                 \
    y1i OP1 tp1r * src3i;                 \
                                          \
    y0i OP2 tp0i * src0r;                 \
    y1i OP2 tp0i * src1r;                 \
    y0i OP2 tp1i * src2r;                 \
    y1i OP2 tp1i * src3r;

#define CGEMV_N_4x2()                     \
    LD_SP2(pa0 + k, 4, t0, t1);           \
    LD_SP2(pa1 + k, 4, t4, t5);           \
                                          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
                                          \
    y0r += tp0r * src0r;                  \
    y0r += tp1r * src2r;                  \
                                          \
    y0r OP0 tp0i * src0i;                 \
    y0r OP0 tp1i * src2i;                 \
                                          \
    y0i OP1 tp0r * src0i;                 \
    y0i OP1 tp1r * src2i;                 \
                                          \
    y0i OP2 tp0i * src0r;                 \
    y0i OP2 tp1i * src2r;

#define CGEMV_N_1x2()               \
    res0 = y[0 * inc_y2];           \
    res1 = y[0 * inc_y2 + 1];       \
                                    \
    res0  += temp0_r * pa0[k];      \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res0  += temp1_r * pa1[k];      \
    res0 OP0 temp1_i * pa1[k + 1];  \
                                    \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];      \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];      \
                                    \
    y[0 * inc_y2]     = res0;       \
    y[0 * inc_y2 + 1] = res1;

#define CGEMV_N_1x1()              \
    res0 = y[0 * inc_y2];          \
    res1 = y[0 * inc_y2 + 1];      \
                                   \
    res0  += temp_r * pa0[k];      \
    res0 OP0 temp_i * pa0[k + 1];  \
                                   \
    res1 OP1 temp_r * pa0[k + 1];  \
    res1 OP2 temp_i * pa0[k];      \
                                   \
    y[0 * inc_y2]     = res0;      \
    y[0 * inc_y2 + 1] = res1;

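/* Load a group of x elements and scale them by alpha.  tp4r/tp4i receive
 * the scaled real and imaginary parts; SPLATI_W4_SP then broadcasts each
 * of the four lanes into tp0r..tp3r / tp0i..tp3i so one scaled x element
 * can multiply a whole column vector of A.  The GP variant gathers
 * strided x elements one 32-bit word at a time with __msa_insert_w. */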
#define CLOAD_X4_SCALE_VECTOR()                  \
    LD_SP2(x, 4, x0, x1);                        \
                                                 \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);             \
                                                 \
    tp4r   = alphar * x0r;                       \
    tp4r OP3 alphai * x0i;                       \
    tp4i   = alphar * x0i;                       \
    tp4i OP4 alphai * x0r;                       \
                                                 \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);

#define CLOAD_X4_SCALE_GP()                                                          \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
                                                                                     \
    tp4r   = alphar * x0r;                                                           \
    tp4r OP3 alphai * x0i;                                                           \
    tp4i   = alphar * x0i;                                                           \
    tp4i OP4 alphai * x0r;                                                           \
                                                                                     \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);                                      \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);

#define CLOAD_X2_SCALE_GP()                        \
    temp0_r   = alpha_r * x[0 * inc_x2];           \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];       \
    temp0_i   = alpha_r * x[0 * inc_x2 + 1];       \
    temp0_i OP4 alpha_i * x[0 * inc_x2];           \
                                                   \
    temp1_r   = alpha_r * x[1 * inc_x2];           \
    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1];       \
    temp1_i   = alpha_r * x[1 * inc_x2 + 1];       \
    temp1_i OP4 alpha_i * x[1 * inc_x2];           \
                                                   \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r);  \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i);  \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r);  \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);

#define CLOAD_X1_SCALE_GP()                  \
    temp_r   = alpha_r * x[0 * inc_x2];      \
    temp_r OP3 alpha_i * x[0 * inc_x2 + 1];  \
    temp_i   = alpha_r * x[0 * inc_x2 + 1];  \
    temp_i OP4 alpha_i * x[0 * inc_x2];

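/* Vector load/store of y for unit complex stride: PCKEVOD_W2_SP splits
 * the interleaved complex values into real/imag planes on load, and
 * ILVRL_W2_SP re-interleaves them before storing back. */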
#define CLOAD_Y8_VECTOR()             \
    LD_SP4(y, 4, y0, y1, y2, y3);     \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);

#define CLOAD_Y4_VECTOR()             \
    LD_SP2(y, 4, y0, y1);             \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);

#define CSTORE_Y8_VECTOR()          \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ILVRL_W2_SP(y1i, y1r, y2, y3);  \
    ST_SP4(y0, y1, y2, y3, y, 4);

#define CSTORE_Y4_VECTOR()          \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ST_SP2(y0, y1, y, 4);

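/* GP (general-purpose register) variants for strided y: gather and
 * scatter one 32-bit float at a time via __msa_insert_w/__msa_copy_s_w.
 * tp0r serves only as an already-initialized source vector for the
 * first insert into each destination register. */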
#define CLOAD_Y8_GP()                                                               \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  1, *((int *)(y + 5 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  2, *((int *)(y + 6 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  3, *((int *)(y + 7 * inc_y2)));      \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  1, *((int *)(y + 5 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  2, *((int *)(y + 6 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  3, *((int *)(y + 7 * inc_y2 + 1)));

#define CLOAD_Y4_GP()                                                               \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));      \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));

#define CSTORE_Y8_GP()                                                \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0);      \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1);      \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2);      \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3);      \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0);  \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1);  \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2);  \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);

#define CSTORE_Y4_GP()                                                \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);

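/* Driver loop: columns of A are consumed in blocks of 4, then 2, then 1
 * (selected by n >> 2, n & 2, n & 1); within each column block, rows of
 * y are processed 8 and then 4 at a time with MSA vectors, with scalar
 * code for the remaining m & 3 rows.  The 8-row loop prefetches ahead
 * in all four active columns of A. */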
#define CGEMV_N_MSA()                         \
    for (j = (n >> 2); j--;)                  \
    {                                         \
        CLOAD_X4_SCALE();                     \
                                              \
        k = 0;                                \
        k_pref = pref_offset;                 \
        y = y_org;                            \
                                              \
        for (i = (m >> 3); i--;)              \
        {                                     \
            PREFETCH(pa0 + k_pref + 16 + 0);  \
            PREFETCH(pa0 + k_pref + 16 + 8);  \
            PREFETCH(pa1 + k_pref + 16 + 0);  \
            PREFETCH(pa1 + k_pref + 16 + 8);  \
            PREFETCH(pa2 + k_pref + 16 + 0);  \
            PREFETCH(pa2 + k_pref + 16 + 8);  \
            PREFETCH(pa3 + k_pref + 16 + 0);  \
            PREFETCH(pa3 + k_pref + 16 + 8);  \
                                              \
            CLOAD_Y8();                       \
            CGEMV_N_8x4();                    \
            CSTORE_Y8();                      \
                                              \
            k += 2 * 8;                       \
            k_pref += 2 * 8;                  \
            y += inc_y2 * 8;                  \
        }                                     \
                                              \
        if (m & 4)                            \
        {                                     \
            CLOAD_Y4();                       \
            CGEMV_N_4x4();                    \
            CSTORE_Y4();                      \
                                              \
            k += 2 * 4;                       \
            y += inc_y2 * 4;                  \
        }                                     \
                                              \
        if (m & 3)                            \
        {                                     \
            temp0_r = tp4r[0];                \
            temp1_r = tp4r[1];                \
            temp2_r = tp4r[2];                \
            temp3_r = tp4r[3];                \
                                              \
            temp0_i = tp4i[0];                \
            temp1_i = tp4i[1];                \
            temp2_i = tp4i[2];                \
            temp3_i = tp4i[3];                \
                                              \
            for (i = (m & 3); i--;)           \
            {                                 \
                CGEMV_N_1x4();                \
                                              \
                k += 2;                       \
                y += inc_y2;                  \
            }                                 \
        }                                     \
                                              \
        pa0 += 4 * lda2;                      \
        pa1 += 4 * lda2;                      \
        pa2 += 4 * lda2;                      \
        pa3 += 4 * lda2;                      \
                                              \
        x += 4 * inc_x2;                      \
    }                                         \
                                              \
    if (n & 2)                                \
    {                                         \
        CLOAD_X2_SCALE();                     \
                                              \
        k = 0;                                \
        y = y_org;                            \
                                              \
        for (i = (m >> 3); i--;)              \
        {                                     \
            CLOAD_Y8();                       \
            CGEMV_N_8x2();                    \
            CSTORE_Y8();                      \
                                              \
            k += 2 * 8;                       \
            y += inc_y2 * 8;                  \
        }                                     \
                                              \
        if (m & 4)                            \
        {                                     \
            CLOAD_Y4();                       \
            CGEMV_N_4x2();                    \
            CSTORE_Y4();                      \
                                              \
            k += 2 * 4;                       \
            y += inc_y2 * 4;                  \
        }                                     \
                                              \
        for (i = (m & 3); i--;)               \
        {                                     \
            CGEMV_N_1x2();                    \
                                              \
            k += 2;                           \
            y += inc_y2;                      \
        }                                     \
                                              \
        pa0 += 2 * lda2;                      \
        pa1 += 2 * lda2;                      \
                                              \
        x += 2 * inc_x2;                      \
    }                                         \
                                              \
    if (n & 1)                                \
    {                                         \
        CLOAD_X1_SCALE();                     \
                                              \
        k = 0;                                \
        y = y_org;                            \
                                              \
        for (i = m; i--;)                     \
        {                                     \
            CGEMV_N_1x1();                    \
                                              \
            k += 2;                           \
            y += inc_y2;                      \
        }                                     \
                                              \
        pa0 += lda2;                          \
        x += inc_x2;                          \
    }

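/* GEMV kernel entry point: m rows, n columns.  The lda2/inc_x2/inc_y2
 * parameters arrive as strides in complex elements and are doubled below
 * so they count FLOATs; dummy1 and buffer are unused here. */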
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k, k_pref, pref_offset;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp_r, temp_i, res0, res1, temp0_r;
    FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
    v4f32 alphar, alphai;
    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;

    lda2 = 2 * lda2;
    inc_x2 = 2 * inc_x2;
    inc_y2 = 2 * inc_y2;

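    /* Distance (in FLOATs) from A to the next L1 cache-line boundary,
     * used to bias the PREFETCH addresses.  For an already line-aligned
     * A this yields a full line rather than 0, which merely shifts the
     * hint by one line and does not affect correctness. */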
    pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
    pref_offset = L1_DATA_LINESIZE - pref_offset;
    pref_offset = pref_offset / sizeof(FLOAT);

    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);

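    /* Pick a specialization: unit complex strides allow full vector
     * loads/stores of x and y; otherwise the element-wise GP gather and
     * scatter variants are substituted by redefining the helper macros. */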
    if ((2 == inc_x2) && (2 == inc_y2))
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_x2)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_y2)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    return 0;
}

#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4