kernel/x86_64/zsymv_U_sse.S

   1 /*********************************************************************/
   2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
   3 /* All rights reserved.                                              */
   4 /*                                                                   */
   5 /* Redistribution and use in source and binary forms, with or        */
   6 /* without modification, are permitted provided that the following   */
   7 /* conditions are met:                                               */
   8 /*                                                                   */
   9 /*   1. Redistributions of source code must retain the above         */
  10 /*      copyright notice, this list of conditions and the following  */
  11 /*      disclaimer.                                                  */
  12 /*                                                                   */
  13 /*   2. Redistributions in binary form must reproduce the above      */
  14 /*      copyright notice, this list of conditions and the following  */
  15 /*      disclaimer in the documentation and/or other materials       */
  16 /*      provided with the distribution.                              */
  17 /*                                                                   */
  18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
  19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
  20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
  21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
  22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
  23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
  24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
  25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
  26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
  27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
  28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
  29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
  30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
  31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
  32 /*                                                                   */
  33 /* The views and conclusions contained in the software and           */
  34 /* documentation are those of the authors and should not be          */
  35 /* interpreted as representing official policies, either expressed   */
  36 /* or implied, of The University of Texas at Austin.                 */
  37 /*********************************************************************/
  38
  39 #define ASSEMBLER
  40 #include "common.h"
  41
  42 #ifdef ATOM
  43 #define PREFETCH        prefetcht0
  44 #define PREFETCHW       prefetcht0
  45 #define PREFETCHSIZE    (16 * 24)
  46 #endif
  47
  48 #ifdef CORE2
  49 #define PREFETCH        prefetcht0
  50 #define PREFETCHW       prefetcht0
  51 #define PREFETCHSIZE    (16 * 24)
  52 #endif
  53
  54 #if defined(PENRYN) || defined(DUNNINGTON)
  55 #define PREFETCH        prefetcht0
  56 #define PREFETCHW       prefetcht0
  57 #define PREFETCHSIZE    (16 * 24)
  58 #endif
  59
  60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
  61 #define PREFETCH        prefetcht0
  62 #define PREFETCHW       prefetcht0
  63 #define PREFETCHSIZE    (16 * 24)
  64 #endif
  65
  66 #ifdef PENTIUM4
  67 #define PREFETCH        prefetcht0
  68 #define PREFETCHW       prefetcht0
  69 #define PREFETCHSIZE    (16 * 28)
  70 #endif
  71
  72 #ifdef OPTERON
  73 #define PREFETCH        prefetch
  74 #define PREFETCHW       prefetchw
  75 #define PREFETCHSIZE    (16 * 12)
  76 #define movsd           movlpd
  77 #endif
  78
  79 #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  80 #define PREFETCH        prefetch
  81 #define PREFETCHW       prefetchw
  82 #define PREFETCHSIZE    (16 * 16)
  83 #endif
  84
  85 #ifdef NANO
  86 #define PREFETCH        prefetcht0
  87 #define PREFETCHW       prefetcht0
  88 #define PREFETCHSIZE    (16 * 24)
  89 #endif
  90
  91 #ifdef GENERIC
  92 #define PREFETCH        prefetcht0
  93 #define PREFETCHW       prefetcht0
  94 #define PREFETCHSIZE    (16 * 14)
  95 #endif
  96
  97 #ifndef WINDOWS_ABI
  98
  99 #define STACKSIZE       80
 100
 101 #define OLD_Y            8 + STACKSIZE(%rsp)
 102 #define OLD_INCY        16 + STACKSIZE(%rsp)
 103 #define OLD_BUFFER      24 + STACKSIZE(%rsp)
 104
 105 #define M         ARG1
 106 #define N         ARG2
 107 #define A         ARG3
 108 #define LDA       ARG4
 109 #define X         ARG5
 110 #define INCX      ARG6
 111
 112 #else
 113
 114 #define STACKSIZE       256
 115
 116 #define OLD_A            40 + STACKSIZE(%rsp)
 117 #define OLD_LDA          48 + STACKSIZE(%rsp)
 118 #define OLD_X            56 + STACKSIZE(%rsp)
 119 #define OLD_INCX         64 + STACKSIZE(%rsp)
 120 #define OLD_Y            72 + STACKSIZE(%rsp)
 121 #define OLD_INCY         80 + STACKSIZE(%rsp)
 122 #define OLD_BUFFER       88 + STACKSIZE(%rsp)
 123
 124 #define M         ARG1
 125 #define N         ARG2
 126 #define A         ARG4
 127 #define LDA       ARG3
 128 #define X         %rdi
 129 #define INCX      %rsi
 130 #endif
 131
 132 #define Y       %r10
 133 #define INCY    %r11
 134 #define BUFFER  %r12
 135
 136 #define TEMP    %rax
 137 #define I       %rax
 138 #define A1      %rbx
 139 #define A2      %rbp
 140 #define XX      %r13
 141 #define YY      %r14
 142 #define IS      %r15
 143 #define NEW_X   BUFFER
 144 #define NEW_Y   X
 145
 146 #define ALPHA_R  %xmm0
 147 #define ALPHA_I  %xmm1
 148
 149 #define xsum1  %xmm0
 150 #define xsum2  %xmm1
 151 #define xsum3  %xmm2
 152 #define xsum4  %xmm3
 153
 154 #define atemp1 %xmm4
 155 #define atemp2 %xmm5
 156 #define atemp3 %xmm6
 157 #define atemp4 %xmm7
 158
 159 #define xtemp1 %xmm8
 160 #define xtemp2 %xmm9
 161 #define a1     %xmm10
 162 #define a2     %xmm11
 163
 164 #define a3     %xmm12
 165 #define yy1    %xmm13
 166 #define xt1    %xmm14
 167 #define xt2    %xmm15
 168
 169 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
 170 #define MOVDDUP(a, b, c)        movddup a(b), c
 171 #define MOVDDUP2(a, b, c)       movddup a##b, c
 172 #else
 173 #define MOVDDUP(a, b, c)        movlpd  a(b), c;movhpd  a(b), c
 174 #define MOVDDUP2(a, b, c)       movlpd  a##b, c;movhpd  a##b, c
 175 #endif
 176
 177         PROLOGUE
 178         PROFCODE
 179
 180         subq    $STACKSIZE, %rsp
 181         movq    %rbx,  0(%rsp)
 182         movq    %rbp,  8(%rsp)
 183         movq    %r12, 16(%rsp)
 184         movq    %r13, 24(%rsp)
 185         movq    %r14, 32(%rsp)
 186         movq    %r15, 40(%rsp)
 187
 188 #ifdef WINDOWS_ABI
 189         movq    %rdi,    48(%rsp)
 190         movq    %rsi,    56(%rsp)
 191         movups  %xmm6,   64(%rsp)
 192         movups  %xmm7,   80(%rsp)
 193         movups  %xmm8,   96(%rsp)
 194         movups  %xmm9,  112(%rsp)
 195         movups  %xmm10, 128(%rsp)
 196         movups  %xmm11, 144(%rsp)
 197         movups  %xmm12, 160(%rsp)
 198         movups  %xmm13, 176(%rsp)
 199         movups  %xmm14, 192(%rsp)
 200         movups  %xmm15, 208(%rsp)
 201
 202         movq    OLD_A,     A
 203         movq    OLD_LDA,   LDA
 204         movq    OLD_X,     X
 205         movq    OLD_INCX,  INCX
 206
 207         movaps  %xmm2, %xmm0
 208         movaps  %xmm3, %xmm1
 209 #endif
 210
 211         movq    OLD_Y,     Y
 212         movq    OLD_INCY,   INCY
 213         movq    OLD_BUFFER, BUFFER
 214
 215         salq    $ZBASE_SHIFT, INCX
 216         salq    $ZBASE_SHIFT, INCY
 217         salq    $ZBASE_SHIFT, LDA
 218
 219         testq   M, M
 220         jle     .L999
 221
 222         negq    IS
 223         addq    M, IS
 224
 225         movq    IS,  TEMP
 226         imulq   LDA, TEMP
 227         addq    TEMP, A
 228
 229         pcmpeqb %xmm3,  %xmm3
 230         xorpd   %xmm2,  %xmm2
 231         pslld   $31,    %xmm3
 232         unpckhps %xmm3, %xmm2
 233
 234         shufps   $0, ALPHA_R, ALPHA_R
 235         shufps   $0, ALPHA_I, ALPHA_I
 236         movaps   ALPHA_I, %xmm3
 237
 238         unpcklps ALPHA_R, ALPHA_I
 239         unpcklps %xmm3,   ALPHA_R
 240         pxor     %xmm2,   ALPHA_R
 241
 242         movq    BUFFER, XX
 243
 244         movq    M,  %rax
 245         sarq    $2, %rax
 246         jle     .L02
 247         ALIGN_3
 248
 249 .L01:
 250         movsd   0 * SIZE(X), %xmm4
 251         addq    INCX, X
 252         movhps  0 * SIZE(X), %xmm4
 253         addq    INCX, X
 254         movsd   0 * SIZE(X), %xmm6
 255         addq    INCX, X
 256         movhps  0 * SIZE(X), %xmm6
 257         addq    INCX, X
 258
 259         movsldup %xmm4, %xmm3
 260         movshdup %xmm4, %xmm4
 261         movsldup %xmm6, %xmm5
 262         movshdup %xmm6, %xmm6
 263
 264         mulps   ALPHA_I, %xmm3
 265         mulps   ALPHA_R, %xmm4
 266         mulps   ALPHA_I, %xmm5
 267         mulps   ALPHA_R, %xmm6
 268
 269         addps   %xmm4, %xmm3
 270         addps   %xmm6, %xmm5
 271
 272         movaps  %xmm3,  4 * SIZE(XX)
 273         movaps  %xmm5, 12 * SIZE(XX)
 274
 275         shufps  $0xb1, %xmm3, %xmm3
 276         shufps  $0xb1, %xmm5, %xmm5
 277
 278         pxor    %xmm2, %xmm3
 279         pxor    %xmm2, %xmm5
 280
 281         movaps  %xmm3,  0 * SIZE(XX)
 282         movaps  %xmm5,  8 * SIZE(XX)
 283
 284         subq    $-16 * SIZE, XX
 285         decq    %rax
 286         jg      .L01
 287         ALIGN_3
 288
 289 .L02:
 290         testq   $2, M
 291         jle     .L03
 292
 293         movsd   0 * SIZE(X), %xmm4
 294         addq    INCX, X
 295         movhps  0 * SIZE(X), %xmm4
 296         addq    INCX, X
 297
 298         movsldup %xmm4, %xmm3
 299         movshdup %xmm4, %xmm4
 300
 301         mulps   ALPHA_I, %xmm3
 302         mulps   ALPHA_R, %xmm4
 303
 304         addps   %xmm4, %xmm3
 305
 306         movaps  %xmm3,  4 * SIZE(XX)
 307
 308         shufps  $0xb1, %xmm3, %xmm3
 309         pxor    %xmm2, %xmm3
 310         movaps  %xmm3,  0 * SIZE(XX)
 311
 312         subq    $-8 * SIZE, XX
 313         ALIGN_3
 314
 315 .L03:
 316         testq   $1, M
 317         jle     .L05
 318
 319         movsd   0 * SIZE(X), %xmm4
 320         addq    INCX, X
 321
 322         movsldup %xmm4, %xmm3
 323         movshdup %xmm4, %xmm4
 324
 325         mulps   ALPHA_I, %xmm3
 326         mulps   ALPHA_R, %xmm4
 327
 328         addps   %xmm4, %xmm3
 329
 330         movlps  %xmm3,  2 * SIZE(XX)
 331
 332         shufps  $0xb1, %xmm3, %xmm3
 333         pxor    %xmm2, %xmm3
 334         movlps  %xmm3,  0 * SIZE(XX)
 335
 336         subq    $-4 * SIZE, XX
 337         ALIGN_3
 338
 339 .L05:
 340         /* now we don't need original X */
 341         movq   Y, NEW_Y
 342
 343         addq   $512, XX
 344         andq   $-512, XX
 345
 346         cmpq   $2 * SIZE, INCY
 347         je    .L10
 348
 349         movq   Y,  YY
 350         movq   XX, NEW_Y
 351
 352         movq    M,  %rax
 353         sarq    $2, %rax
 354         jle     .L07
 355         ALIGN_3
 356
 357 .L06:
 358         movsd   0 * SIZE(YY), %xmm0
 359         addq    INCY, YY
 360         movhps  0 * SIZE(YY), %xmm0
 361         addq    INCY, YY
 362         movsd   0 * SIZE(YY), %xmm1
 363         addq    INCY, YY
 364         movhps  0 * SIZE(YY), %xmm1
 365         addq    INCY, YY
 366
 367         movaps  %xmm0, 0 * SIZE(XX)
 368         movaps  %xmm1, 8 * SIZE(XX)
 369
 370         addq    $8 * SIZE, XX
 371         decq    %rax
 372         jg      .L06
 373         ALIGN_3
 374
 375 .L07:
 376         movq    M, %rax
 377         andq    $3, %rax
 378         jle     .L10
 379         ALIGN_3
 380
 381 .L08:
 382         movsd   0 * SIZE(YY), %xmm0
 383         addq    INCY, YY
 384
 385         movlps  %xmm0, 0 * SIZE(XX)
 386
 387         addq    $2 * SIZE, XX
 388         decq    %rax
 389         jg      .L08
 390         ALIGN_3
 391
 392 .L10:
 393         movq     IS, I
 394         addq     $2, I
 395         cmpq     M,  I
 396         jg       .L20
 397         ALIGN_3
 398
 399 .L11:
 400         movq    A,  A1
 401         leaq    (A, LDA, 1), A2
 402         leaq    (A, LDA, 2), A
 403
 404         leaq    (, IS, 4), I
 405
 406         movsd    0 * SIZE(NEW_X, I, SIZE), atemp2
 407         movhps   4 * SIZE(NEW_X, I, SIZE), atemp2
 408         movsd    2 * SIZE(NEW_X, I, SIZE), atemp4
 409         movhps   6 * SIZE(NEW_X, I, SIZE), atemp4
 410
 411         pshufd   $0xcc, atemp2, atemp1
 412         pshufd   $0x99, atemp2, atemp2
 413         pshufd   $0xcc, atemp4, atemp3
 414         pshufd   $0x99, atemp4, atemp4
 415
 416         pxor            xsum1, xsum1
 417         pxor            xsum2, xsum2
 418         pxor            xsum3, xsum3
 419         pxor            xsum4, xsum4
 420
 421         movq            NEW_X, XX
 422         movq            NEW_Y, YY
 423
 424         movq    IS,  I
 425         sarq    $2,  I
 426         jle     .L15
 427         ALIGN_3
 428
 429 .L12:
 430         HALT
 431
 432         subq     $-16 * SIZE, XX
 433         addq     $  8 * SIZE, YY
 434         addq     $  8 * SIZE, A1
 435         addq     $  8 * SIZE, A2
 436
 437         decq     I
 438         jg       .L12
 439         ALIGN_3
 440
 441 .L15:
 442         testq   $2, IS
 443         jle     .L18
 444
 445         movsd    0 * SIZE(YY), yy1
 446         movhps   2 * SIZE(YY), yy1
 447
 448         movaps   0 * SIZE(XX), xtemp1
 449         movaps   4 * SIZE(XX), xtemp2
 450
 451         movsd    0 * SIZE(A1), a1
 452         movhps   2 * SIZE(A1), a1
 453
 454         movaps   xtemp1, xt1
 455         movaps   xtemp2, xt2
 456         mulps    a1, xt1
 457         mulps    a1, xt2
 458         addps    xt1, xsum1
 459         addps    xt2, xsum2
 460
 461         pshufd   $0xb1, a1, xt2
 462         mulps    atemp1, a1
 463         mulps    atemp2, xt2
 464         addps    a1,  yy1
 465         addps    xt2, yy1
 466
 467         movsd    0 * SIZE(A2), a1
 468         movhps   2 * SIZE(A2), a1
 469
 470         movaps   xtemp1, xt1
 471         movaps   xtemp2, xt2
 472         mulps    a1, xt1
 473         mulps    a1, xt2
 474         addps    xt1, xsum3
 475         addps    xt2, xsum4
 476
 477         pshufd   $0xb1, a1, xt2
 478         mulps    atemp1, a1
 479         mulps    atemp2, xt2
 480         addps     a1, yy1
 481         addps    xt2, yy1
 482
 483         movlps   yy1, 0 * SIZE(YY)
 484         movhps   yy1, 2 * SIZE(YY)
 485
 486         addq     $8 * SIZE, XX
 487         addq     $4 * SIZE, YY
 488         addq     $4 * SIZE, A1
 489         addq     $4 * SIZE, A2
 490         ALIGN_3
 491
 492 .L18:
 493         leaq    (, IS, 4), I
 494
 495         movaps   0 * SIZE(NEW_X, I, SIZE), atemp1
 496         movaps   4 * SIZE(NEW_X, I, SIZE), atemp2
 497
 498         movlps   0 * SIZE(YY), yy1
 499         movhps   2 * SIZE(YY), yy1
 500
 501         movsd    0 * SIZE(A1), a1
 502         movhps   0 * SIZE(A2), a1
 503
 504         movaps   a1, a2
 505         mulps    atemp1, a1
 506         mulps    atemp2, a2
 507         addps    a1, xsum1
 508         addps    a2, xsum2
 509
 510         movsd    0 * SIZE(A2), a1
 511         movhps   2 * SIZE(A2), a1
 512
 513         movaps   a1, a2
 514         mulps    atemp1, a1
 515         mulps    atemp2, a2
 516         addps    a1, xsum3
 517         addps    a2, xsum4
 518
 519         haddps   xsum2, xsum1
 520         haddps   xsum4, xsum3
 521
 522         haddps   xsum3, xsum1
 523         addps    xsum1, yy1
 524
 525         movlps   yy1, 0 * SIZE(YY)
 526         movhps   yy1, 2 * SIZE(YY)
 527
 528         addq     $2, IS
 529
 530         movq     IS, I
 531         addq     $2, I
 532         cmpq     M, I
 533         jle      .L11
 534         ALIGN_3
 535
 536 .L20:
 537         testq   $1, M
 538         jle     .L990
 539
 540
 541 .L990:
 542         cmpq   $2 * SIZE, INCY
 543         je    .L999
 544
 545         movq    M,  %rax
 546         sarq    $2, %rax
 547         jle     .L997
 548         ALIGN_3
 549
 550 .L996:
 551         movaps   0 * SIZE(NEW_Y), %xmm0
 552         movaps   4 * SIZE(NEW_Y), %xmm1
 553
 554         movlps  %xmm0,  0 * SIZE(Y)
 555         addq    INCY, Y
 556         movhps  %xmm0,  0 * SIZE(Y)
 557         addq    INCY, Y
 558         movlps  %xmm1,  0 * SIZE(Y)
 559         addq    INCY, Y
 560         movhps  %xmm1,  0 * SIZE(Y)
 561         addq    INCY, Y
 562
 563         addq    $8 * SIZE, NEW_Y
 564         decq    %rax
 565         jg      .L996
 566         ALIGN_3
 567
 568 .L997:
 569         movq    M, %rax
 570         andq    $3, %rax
 571         jle     .L999
 572         ALIGN_3
 573
 574 .L998:
 575         movlps  0 * SIZE(NEW_Y), %xmm0
 576         addq    $2 * SIZE, NEW_Y
 577
 578         movlps  %xmm0,  0 * SIZE(Y)
 579         addq    INCY, Y
 580
 581         decq    %rax
 582         jg      .L998
 583         ALIGN_3
 584
 585 .L999:
 586         movq      0(%rsp), %rbx
 587         movq      8(%rsp), %rbp
 588         movq     16(%rsp), %r12
 589         movq     24(%rsp), %r13
 590         movq     32(%rsp), %r14
 591         movq     40(%rsp), %r15
 592         addq    $STACKSIZE, %rsp
 593         ret
 594         EPILOGUE